├── .gitignore
├── README.md
├── notebooks
├── Compare_APIs.ipynb
├── Compare_Timing_Logs.ipynb
├── GenGlobalTrust_indexed.ipynb
├── GenLocalTrust.ipynb
├── GenPersonalGraph.ipynb
├── README.md
├── debug_prod_graph.ipynb
├── igraph-engagement_addr.ipynb
├── igraph-engagement_fid.ipynb
├── requirements.txt
└── scripts_export
│ └── GenPersonalGraph.py
├── pipeline
├── .env.sample
├── Dockerfile
├── README.md
├── casts
│ ├── __init__.py
│ ├── cast_db_utils.py
│ ├── main.py
│ ├── main_fetch_top_casters.py
│ └── main_fetch_top_spammers.py
├── channels
│ ├── Bot_Fids.csv
│ ├── Seed_Fids.csv
│ ├── Top_Channels.csv
│ ├── Trending_Channels.csv
│ ├── __init__.py
│ ├── channel_db_utils.py
│ ├── channel_queries.py
│ ├── channel_utils.py
│ ├── main.py
│ ├── main_channel_rank.py
│ ├── main_fetch_channel_top_casters.py
│ ├── main_metrics.py
│ ├── main_notify_daily_trending.py
│ ├── main_notify_leaderboard.py
│ ├── main_notify_weekly_mods.py
│ ├── main_openrank.py
│ ├── main_points.py
│ ├── main_tokens.py
│ └── openrank_utils.py
├── config.py
├── crontab.txt
├── cura_utils.py
├── dag_utils
│ ├── clear_task_instance.py
│ ├── combine_csv.py
│ └── dune_backup.py
├── dags
│ ├── archived
│ │ ├── dag_automod.py
│ │ ├── dag_backup_sandbox_db.py
│ │ ├── dag_copy_graph_files_to_sandbox_dev_v1.py
│ │ ├── dag_degen_tips_processing.py
│ │ ├── dag_gen_personal_graph_replica_v0.py
│ │ ├── dag_insert_degen_ranking_v0.py
│ │ ├── dag_monitor_sandbox.py
│ │ ├── dag_run_frame_pipeline_v0.py
│ │ ├── degen
│ │ │ ├── calculate_rank.py
│ │ │ └── create_degen_sql_functions.py
│ │ ├── extractors
│ │ │ ├── dag_warpcast_channel_followers.py
│ │ │ ├── dag_warpcast_channel_members.py
│ │ │ └── dag_warpcast_channels.py
│ │ └── sandbox
│ │ │ ├── dag_sync_sandbox_casts.py
│ │ │ ├── dag_sync_sandbox_channel_fids.py
│ │ │ ├── dag_sync_sandbox_db_dev.py
│ │ │ ├── dag_sync_sandbox_globaltrust.py
│ │ │ └── dag_sync_sandbox_labels.py
│ ├── cura
│ │ ├── dag_direct_cast_join_requests.py
│ │ ├── dag_run_autoinvite_rules.py
│ │ └── dag_run_quote_casts.py
│ ├── dag_backup_to_s3_v1.py
│ ├── dag_copy_graph_files_to_replicas_v1.py
│ ├── dag_gen_channel_openrank.py
│ ├── dag_gen_channel_ranking_v3.py
│ ├── dag_gen_channel_ranking_v4.py
│ ├── dag_gen_globaltrust_v1.py
│ ├── dag_gen_personal_graph_replica_v1.py
│ ├── dag_notify_channel_daily_trending.py
│ ├── dag_notify_channel_leaderboard.py
│ ├── dag_notify_channel_weekly_mods.py
│ ├── dag_refresh_rank_view_v0.py
│ ├── dag_run_cast_pipeline_v0.py
│ ├── dag_update_channel_points.py
│ ├── dag_update_channel_tokens.py
│ ├── extractors
│ │ └── dag_cura_mod.py
│ ├── monitoring
│ │ ├── __init__.py
│ │ ├── dag_monitor_nindexer.py
│ │ └── dag_monitor_replication.py
│ ├── one_off
│ │ ├── .placeholder
│ │ ├── dag_gen_globaltrust_by_date_v0.py
│ │ ├── dag_gen_globaltrust_by_date_v1.py
│ │ ├── dag_insert_to_dune_table.py
│ │ ├── dag_migrate_dune_table.py
│ │ ├── dag_trial_branch.py
│ │ ├── dag_trial_sql.py
│ │ ├── dag_trial_task_groups.py
│ │ └── dag_trial_trigger.py
│ ├── pg_to_dune
│ │ ├── .env.sample
│ │ ├── app
│ │ │ └── check_last_timestamp.py
│ │ └── upload_to_dune.sh
│ ├── reports
│ │ ├── dag_gen_channel_metrics.py
│ │ └── dag_gen_labels.py
│ └── triggers
│ │ ├── trigger_gen_channel_ranking_v3.py
│ │ └── trigger_gen_channel_ranking_v4.py
├── db_utils.py
├── docker-compose.yaml
├── extractors
│ ├── automod_extractor.py
│ ├── channel_extractor_utils.py
│ ├── cura_mod_extractor.py
│ ├── extract_channel_data.sh
│ ├── extract_channel_fids.sh
│ ├── extract_cura_mod.sh
│ ├── main_channel_data.py
│ └── main_channel_fids.py
├── frames
│ ├── __init__.py
│ ├── frames_db_utils.py
│ ├── incremental_load_cast_mapping.sql
│ ├── incremental_load_labels.sql
│ ├── main.py
│ ├── scrape_utils.py
│ └── test_urls.py
├── globaltrust
│ ├── __init__.py
│ ├── compute.py
│ ├── export_localtrust_daily_stats.sql
│ ├── gen_globaltrust.py
│ ├── queries.py
│ └── test_data.py
├── go_eigentrust.py
├── graph
│ ├── __init__.py
│ ├── export_existingConnections_addr.sql
│ ├── export_existingConnections_fid.sql
│ ├── export_l1rep6rec3m12enhancedConnections_addr.sql
│ ├── export_l1rep6rec3m12enhancedConnections_fid.sql
│ ├── fetch_nodes_edges.py
│ ├── gen_igraph.py
│ ├── gen_personal_graph_amp.py
│ ├── gen_personal_graph_amp_v1.py
│ ├── graph_utils.py
│ ├── rechunk_graph_pqt.py
│ └── serve_igraph.py
├── igraph-docker-compose.yml
├── igraph.Dockerfile
├── igraph.nginx.conf
├── logs
│ └── .placeholder
├── plugins
│ ├── .placeholder
│ ├── __init__.py
│ └── hooks
│ │ ├── __init__.py
│ │ ├── common.py
│ │ ├── discord.py
│ │ └── pagerduty.py
├── requirements.txt
├── run_cast_pipeline.sh
├── run_channel_metrics.sh
├── run_channel_openrank.sh
├── run_channel_scraper_v3.sh
├── run_channel_scraper_v4.sh
├── run_download_pqt_files_v1.sh
├── run_eigen2_postgres_sql.sh
├── run_eigen8_postgres_sql.sh
├── run_fetch_channel_top_caster.sh
├── run_fetch_top_caster.sh
├── run_fetch_top_spammers.sh
├── run_frame_scraper.sh
├── run_globaltrust_pipeline.sh
├── run_graph_pipeline.sh
├── run_notify_channel_daily_trending.sh
├── run_notify_channel_leaderboard.sh
├── run_notify_channel_weekly_mods.sh
├── run_personal_graph_pipeline_v1.sh
├── run_update_channel_points.sh
├── run_update_channel_tokens.sh
├── samples
│ ├── localtrust-engagement.csv
│ ├── localtrust-following.csv
│ └── pretrust.csv
├── schema
│ ├── globaltrust_config.sql
│ ├── k3l_objects.sql
│ ├── k3l_schema.sql
│ ├── neynar_db_schema.sql
│ ├── pretrust_v2.sql
│ └── replicator_db_schema.sql
├── scripts
│ ├── archived
│ │ ├── run_create_degen_db_functions.sh
│ │ ├── run_personal_graph_pipeline.sh
│ │ ├── run_sandbox_backup.sh
│ │ └── run_urlextract_pipeline.sh
│ └── one_off
│ │ ├── diff_db_table.py
│ │ ├── diff_json_api.py
│ │ └── run_cast_pipeline_gapfills.sh
├── sshtunnel.Dockerfile
├── timer.py
├── tmp
│ └── .placeholder
└── utils.py
├── scripts
├── .placeholder
└── certs
│ └── graphcast_jobs
│ ├── .env.sample
│ ├── README.md
│ ├── graph.cast.k3l.io
│ ├── graph.castN.k3l.io
│ ├── install_certs.sh
│ └── push_certs.sh
├── serve
├── .dockerignore
├── .env.sample
├── .gitignore
├── .idea
│ ├── .gitignore
│ ├── .name
│ ├── codeStyles
│ │ └── codeStyleConfig.xml
│ ├── dataSources.xml
│ ├── data_source_mapping.xml
│ ├── farcaster-graph-serve.iml
│ ├── inspectionProfiles
│ │ └── profiles_settings.xml
│ ├── misc.xml
│ ├── modules.xml
│ ├── sqldialects.xml
│ ├── vcs.xml
│ └── watcherTasks.xml
├── Dockerfile
├── README.md
├── app
│ ├── __init__.py
│ ├── config.py
│ ├── dependencies
│ │ ├── __init__.py
│ │ ├── cache_db_utils.py
│ │ ├── db_pool.py
│ │ ├── db_utils.py
│ │ ├── graph.py
│ │ ├── logging.py
│ │ └── memoize_utils.py
│ ├── graph_loader.py
│ ├── main.py
│ ├── models
│ │ ├── __init__.py
│ │ ├── channel_model.py
│ │ ├── feed_model.py
│ │ ├── graph_model.py
│ │ └── score_model.py
│ ├── routers
│ │ ├── __init__.py
│ │ ├── cast_router.py
│ │ ├── channel_router.py
│ │ ├── direct_router.py
│ │ ├── frame_router.py
│ │ ├── globaltrust_router.py
│ │ ├── graph_router.py
│ │ ├── localtrust_router.py
│ │ ├── metadata_router.py
│ │ ├── token_router.py
│ │ └── user_router.py
│ ├── telemetry.py
│ └── utils.py
├── docker-compose.yml
├── poetry.lock
├── pyproject.toml
├── samples
│ ├── fc_90dv3_fid_SUCCESS
│ ├── fc_90dv3_fid_df.pkl
│ ├── fc_90dv3_fid_ig.pkl
│ ├── fc_engagement_SUCCESS
│ ├── fc_engagement_df.pkl
│ ├── fc_engagement_fid_SUCCESS
│ ├── fc_engagement_fid_df.pkl
│ ├── fc_engagement_fid_ig.pkl
│ ├── fc_engagement_idx.pkl
│ ├── fc_engagement_ig.pkl
│ ├── fc_following_SUCCESS
│ ├── fc_following_df.pkl
│ ├── fc_following_fid_SUCCESS
│ ├── fc_following_fid_df.pkl
│ ├── fc_following_fid_ig.pkl
│ ├── fc_following_idx.pkl
│ ├── fc_following_ig.pkl
│ ├── fid_scores.json
│ ├── lt_existingConnections_addr.csv
│ ├── lt_existingConnections_fid.csv
│ ├── lt_fboostedl1rep3rec6m12_90d_fid.csv
│ ├── lt_l1rep6rec3m12enhancedConnections_addr.csv
│ ├── lt_l1rep6rec3m12enhancedConnections_fid.csv
│ └── personal_graph.parquet
├── scratchpad.md
├── scripts
│ └── lint.sh
└── static
│ └── favicon.png
└── sql
├── counts_by_day.sql
├── counts_by_table.sql
├── k3l_requirements.sql
├── neynar-replica
├── .env.sample
├── Dockerfile
├── Dockerfile.alpine
├── Dockerfile.noble
├── docker-compose.yml
├── entrypoint.sh
├── pg_hba.conf
├── postgresql.conf
└── postgresql.conf.orig
├── replicator_drop_fk.sql
└── replicator_schema.sql
/.gitignore:
--------------------------------------------------------------------------------
1 | node_modules/
2 | dist/
3 | .env
4 | .env.docker
5 | .*.credentials.json
6 | build/
7 | .venv
8 | *.pyc
9 | **/.ipynb_checkpoints
10 | **/.DS_Store
11 | **/lib/
12 | notebooks/data/
13 | **/pg_to_dune/csv
14 | pipeline/logs
15 | **/.vscode
16 | certificates
17 | **/tmp
18 | # Vim swap files
19 | .*.sw?
20 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Extract Graph-based insights from Farcaster
2 | The project is broken into three sub-projects:
3 |
4 | 1. `notebooks` - Jupyter notebooks for data exploration and prototyping graph queries.
5 | 2. `pipeline` - Python scripts to generate graphs and dataframes that can be used to serve graph-based queries.
6 | 3. `serve` - FastAPI server to serve API requests for querying the graph from Farcaster.
7 |
8 | __NOTE__: For details on how to deploy an individual sub-project, check out the README under that sub-project.
9 |
10 |
--------------------------------------------------------------------------------
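The split above means downstream consumers don't run the pipeline themselves: the pipeline publishes graph and score artifacts, and clients query the `serve` FastAPI app over HTTP. As a rough, hypothetical sketch of that interaction (the base URL and endpoint path below are placeholders assumed for illustration; the real routes live under `serve/app/routers/`):

```python
# Hypothetical client sketch for the `serve` FastAPI sub-project.
# BASE_URL and ENDPOINT are placeholders, not routes defined in this repo;
# see serve/app/routers/ for the actual paths.
import requests

BASE_URL = "http://localhost:8000"          # wherever `serve` is deployed
ENDPOINT = "/hypothetical/graph/neighbors"  # placeholder path

resp = requests.get(f"{BASE_URL}{ENDPOINT}", params={"fid": 3, "limit": 10}, timeout=10)
resp.raise_for_status()
print(resp.json())
```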
/notebooks/README.md:
--------------------------------------------------------------------------------
1 | # Pre-requisites
2 | Assuming you have Python and [pip](https://pip.pypa.io/en/stable/) installed on your system (ideally inside a [virtualenv](https://docs.python.org/3/library/venv.html)), install the dependencies with `pip install -r requirements.txt`.
3 |
4 | # Exploring the Notebooks
5 | Run `jupyter notebook` and explore the notebooks in your default browser.
--------------------------------------------------------------------------------
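The notebooks listed above (for example `igraph-engagement_fid.ipynb` and `GenPersonalGraph.ipynb`) prototype graph queries with `pandas` and `igraph`. A minimal sketch of that style of exploration, assuming a localtrust-like edge list with `i,j,v` columns; the file path and column names are illustrative, not files shipped with the notebooks:

```python
# Exploration sketch: load an (i, j, v) edge list and rank neighbors of a seed FID.
# The CSV path and column names are assumptions for illustration.
import igraph as ig
import pandas as pd

edges = pd.read_csv("data/localtrust-engagement.csv", dtype={"i": str, "j": str})

# Build a weighted, directed graph; with weights=True the third tuple element
# is stored as the 'weight' edge attribute.
g = ig.Graph.TupleList(
    edges[["i", "j", "v"]].itertuples(index=False),
    directed=True,
    weights=True,
)

# Personalized PageRank seeded at one FID, then show the ten highest-scoring vertices.
seed = g.vs.find(name="3").index
scores = g.personalized_pagerank(reset_vertices=[seed], weights="weight")
print(sorted(zip(g.vs["name"], scores), key=lambda t: t[1], reverse=True)[:10])
```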
/notebooks/requirements.txt:
--------------------------------------------------------------------------------
1 | jupyter
2 | pandas
3 | igraph
4 | niquests
5 | ipython
6 |
--------------------------------------------------------------------------------
/pipeline/.env.sample:
--------------------------------------------------------------------------------
1 | DB_HOST="ip.address.or.host"
2 | DB_PORT=5432
3 | DB_USER="usually_postgres"
4 | DB_NAME="db_name_like_lens_bigquery"
5 | DB_PASSWORD="db_password"
6 |
7 | REMOTE_DB_HOST="ip.address.or.host"
8 | REMOTE_DB_PORT=9541
9 |
10 | TBL_CHANNEL_FIDS='DANGER_deletemefordefault_or_changeme'
11 |
12 | PERSONAL_IGRAPH_INPUT='PATH_TO_IG_PKL'
13 | PERSONAL_IGRAPH_URL='CHANGE_THIS_URL'
14 |
15 | IS_TEST='false'
16 |
17 | AIRFLOW_UID=0
18 | AIRFLOW_GID=0
19 | AIRFLOW__CORE__FERNET_KEY='changeme'
20 |
21 | SSH_KEY_PATH="changeme"
22 | DUNE_API_KEY="changeme"
23 |
24 | # Safe Defaults
25 | POSTGRES_TIMEOUT_SECS=60
26 |
27 | GO_EIGENTRUST_URL='http://localhost:8080'
28 | GO_EIGENTRUST_TIMEOUT_MS=600000
29 | GO_EIGENTRUST_BIND_SRC='/tmp'
30 | GO_EIGENTRUST_BIND_TARGET='/tmp'
31 | GO_EIGENTRUST_FILE_MODE='false'
32 | EIGENTRUST_ALPHA=0.5
33 | EIGENTRUST_EPSILON=1.0
34 | EIGENTRUST_MAX_ITER=50
35 | EIGENTRUST_FLAT_TAIL=2
36 |
37 | FRAMES_NAP_SECS=10
38 | FRAMES_SLEEP_SECS=300
39 | FRAMES_BATCH_SIZE=1000
40 | FRAMES_SCRAPE_CONCURRENCY=10
41 | FRAMES_SCRAPE_CONNECT_TIMEOUT_SECS=5
42 | FRAMES_SCRAPE_READ_TIMEOUT_SECS=10
43 |
44 | CASTS_SLEEP_SECS=10
45 | CASTS_BATCH_LIMIT=100000
46 |
47 | WARPCAST_CHANNELS_TIMEOUT_SECS=5
48 | CHANNEL_SLEEP_SECS=1
49 |
50 |
51 | LOG_LEVEL='INFO'
52 | LOG_FORMAT='[%(asctime)s - %(levelname)s - %(filename)s:%(lineno)s - %(funcName)s ] %(message)s'
53 | LOGURU_FORMAT='{time:YYYY-MM-DD HH:mm:ss} | {module}:{file}:{function}:{line} | {level} | {message}'
54 | LOG_PATH='/tmp/'
55 |
--------------------------------------------------------------------------------
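Code in the pipeline reads these variables through `config.settings` (for example `settings.LOG_LEVEL`, `settings.POSTGRES_TIMEOUT_SECS`, and `settings.ALT_POSTGRES_DSN.get_secret_value()`), which points to a pydantic-style settings object. A minimal sketch of such a loader, assuming the pydantic-settings package; only a few of the variables above are shown, and any field not referenced elsewhere in this repo is illustrative:

```python
# Sketch of an .env-backed settings object (assumption: pydantic-settings v2 API).
# SecretStr mirrors the settings.*.get_secret_value() calls used in the pipeline.
from pydantic import SecretStr
from pydantic_settings import BaseSettings, SettingsConfigDict


class Settings(BaseSettings):
    model_config = SettingsConfigDict(env_file=".env", extra="ignore")

    DB_HOST: str = "localhost"
    DB_PORT: int = 5432
    DB_USER: str = "postgres"
    DB_NAME: str = "farcaster"
    DB_PASSWORD: SecretStr = SecretStr("")

    POSTGRES_TIMEOUT_SECS: int = 60
    EIGENTRUST_ALPHA: float = 0.5
    LOG_LEVEL: str = "INFO"


settings = Settings()  # values from .env override the defaults above
```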
/pipeline/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM apache/airflow:latest
2 | # Switch to root to install additional packages
3 | USER root
4 |
5 | # Fix potential permission issues and update package list
6 | RUN chmod -R a+rX /var/lib/apt/lists /var/cache/apt/archives && \
7 | apt-get clean && \
8 | rm -rf /var/lib/apt/lists/* && \
9 | mkdir -p /var/lib/apt/lists/partial && \
10 | apt-get update && \
11 | apt-get -y install zip
12 |
13 | RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"
14 | RUN unzip awscliv2.zip
15 |
16 | RUN ./aws/install
17 |
18 | # Switch back to the airflow user
19 | USER airflow
20 |
21 | # Set working directory
22 | WORKDIR /pipeline
23 |
24 | # Copy only the necessary files for initial setup
25 | COPY requirements.txt /pipeline/requirements.txt
26 | COPY .env /pipeline/.env
27 |
28 | # Source environment variables
29 | RUN /bin/bash -c "source /pipeline/.env"
30 |
31 | RUN pip install --upgrade pip
32 |
33 | RUN pip install -r /pipeline/requirements.txt
34 | RUN pip install apache-airflow-providers-pagerduty==3.7.2 discord-webhook==1.3.1 apache-airflow-providers-ssh==3.11.2
35 |
36 |
37 |
--------------------------------------------------------------------------------
/pipeline/README.md:
--------------------------------------------------------------------------------
1 | # Pre-requisites
2 | 1. Install [psql](https://www.timescale.com/blog/how-to-install-psql-on-mac-ubuntu-debian-windows/) on your local machine.
3 | 2. Run an instance of Postgres DB with data from Farcaster (installed locally or on a remote server)
4 | 3. Install [Python 3.12](https://www.python.org/downloads/)
5 | 4. Create a Python [virtualenv](https://docs.python.org/3/library/venv.html) somewhere on your machine - for example,`python3 -m venv .venv` will create a virtualenv in your current directory.
6 | 5. Copy/rename the `.env.sample` file to `.env` and update it with the details of the Postgres DB from step 2 and the virtualenv from step 4.
7 | 6. If creating `.venv` fails, remove the partial environment with `rm -rf .venv`, install the venv module with `sudo apt install python3.12-venv`, and retry step 4.
8 |
9 | # Run the pipeline
10 | `sh run_pipeline.sh -w . -o /tmp/fc_graph`
11 |
12 |
13 |
14 |
--------------------------------------------------------------------------------
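Before launching any of the `run_*.sh` scripts it can be worth confirming that the `.env` values actually reach the Postgres instance from step 2. A small pre-flight check, assuming SQLAlchemy and python-dotenv (both already used by the pipeline); the script itself is not part of the repo:

```python
# Hypothetical pre-flight check: verify the DB_* values in .env open a connection.
import os

from dotenv import load_dotenv
from sqlalchemy import create_engine, text
from sqlalchemy.engine import URL

load_dotenv()  # reads .env from the current directory

url = URL.create(
    "postgresql",
    username=os.environ["DB_USER"],
    password=os.environ["DB_PASSWORD"],
    host=os.environ["DB_HOST"],
    port=int(os.environ["DB_PORT"]),
    database=os.environ["DB_NAME"],
)
engine = create_engine(url, connect_args={"connect_timeout": 5})

with engine.connect() as conn:
    print(conn.execute(text("SELECT version()")).scalar())
```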
/pipeline/casts/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/pipeline/casts/__init__.py
--------------------------------------------------------------------------------
/pipeline/casts/main_fetch_top_casters.py:
--------------------------------------------------------------------------------
1 | # standard dependencies
2 | import sys
3 | from datetime import date
4 |
5 | # local dependencies
6 | from config import settings
7 | import utils
8 | from . import cast_db_utils
9 |
10 | # 3rd party dependencies
11 | from dotenv import load_dotenv
12 | from loguru import logger
13 | from sqlalchemy import create_engine
14 |
15 | logger.remove()
16 | level_per_module = {
17 | "": settings.LOG_LEVEL,
18 | "silentlib": False
19 | }
20 | logger.add(sys.stdout,
21 | colorize=True,
22 | format=settings.LOGURU_FORMAT,
23 | filter=level_per_module,
24 | level=0)
25 |
26 | def main():
27 | pg_dsn = settings.ALT_POSTGRES_DSN.get_secret_value()
28 | df = cast_db_utils.fetch_top_casters_df(logger, pg_dsn)
29 | # top_casters = []
30 | # for caster in casters:
31 | # top_casters.append({'i': caster['i'], 'v': caster['v']})
32 |
33 | # df = pd.DataFrame(data=top_casters)
34 | df["date_iso"] = date.today()
35 | logger.info(utils.df_info_to_string(df, with_sample=True))
36 |
37 | postgres_engine = create_engine(
38 | settings.ALT_POSTGRES_URL.get_secret_value(),
39 | connect_args={"connect_timeout": settings.POSTGRES_TIMEOUT_SECS},  # libpq connect_timeout is specified in seconds
40 | )
41 | logger.info(postgres_engine)
42 | with postgres_engine.connect() as connection:
43 | df.to_sql('k3l_top_casters', con=connection, if_exists='append', index=False)
44 |
45 | # cast_db_utils.insert_dune_table(settings.DUNE_API_KEY, 'openrank', 'top_caster', df)
46 |
47 | logger.info('top casters data updated to DB')
48 |
49 |
50 |
51 |
52 | if __name__ == "__main__":
53 | load_dotenv()
54 | print(settings)
55 |
56 | # parser = argparse.ArgumentParser(description='Fetch top casters, persist the dataframe to db')
57 | #
58 | # parser.add_argument('-u', '--user')
59 | # parser.add_argument('-p', '--password')
60 | # parser.add_argument('-e', '--endpoint')
61 | #
62 | # args = parser.parse_args()
63 |
64 | logger.info('hello hello')
65 | main()
66 |
--------------------------------------------------------------------------------
/pipeline/channels/Bot_Fids.csv:
--------------------------------------------------------------------------------
1 | FID,Username
2 | 262301,roundsbot
3 | 862591,cura-bot
4 | 864314,curabot
5 | 396644,hyperbot
6 | 861203,modbot
7 | 368422,automod
8 | 364927,paybot
--------------------------------------------------------------------------------
/pipeline/channels/Seed_Fids.csv:
--------------------------------------------------------------------------------
1 | channel id,Seed Peers FIDs
2 | superrare,"9480,9480, 190045, 12299, 346769, 374498, 513681, 270678, 368422,12299, 190045, 270678, 346769, 374498, 513681, 9480"
3 | build,"8446, 195255, 221216, 6730, 9856, 4461, 1214, 9816, 15732, 399485, 16085, 14351, 99"
4 | memes,"576, 3, 2, 3621, 239, 457, 347, 557, 4407, 1287, 1325"
5 | dev,"191, 6841"
6 | louder,"238853,15696, 206, 403020, 395131, 508334, 477292"
7 | wildcardclub,"4914, 7791"
8 |
--------------------------------------------------------------------------------
/pipeline/channels/Trending_Channels.csv:
--------------------------------------------------------------------------------
1 | ChannelID
2 | zora
3 | farcaster
4 | itookaphoto
5 | memes
6 | replyguys
7 | farville
8 | degen
9 | nature
10 | sense
11 | food
12 | jobs
13 | lifeisgood
14 | anime-manga
15 | football
16 | higher
17 | dickbutt
18 | art
19 | talent
20 | brypto
21 | dickbutt
22 | six
23 | vibely
24 | screens
25 | nba
--------------------------------------------------------------------------------
/pipeline/channels/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/pipeline/channels/__init__.py
--------------------------------------------------------------------------------
/pipeline/channels/main_metrics.py:
--------------------------------------------------------------------------------
1 | # standard dependencies
2 | import sys
3 | import argparse
4 | import datetime
5 |
6 | # local dependencies
7 | from config import settings
8 | from . import channel_db_utils
9 | from .channel_db_utils import Metric
10 |
11 | # 3rd party dependencies
12 | from dotenv import load_dotenv
13 | from loguru import logger
14 |
15 | # Configure logger
16 | logger.remove()
17 | level_per_module = {
18 | "": settings.LOG_LEVEL,
19 | "silentlib": False
20 | }
21 | logger.add(sys.stdout,
22 | colorize=True,
23 | format=settings.LOGURU_FORMAT,
24 | filter=level_per_module,
25 | level=0)
26 |
27 | load_dotenv()
28 |
29 | def main():
30 | # Metrics only available in Eigen 8
31 | pg_dsn = settings.ALT_POSTGRES_DSN.get_secret_value()
32 | sql_timeout_ms = 120_000
33 | channel_db_utils.upsert_weekly_metrics(logger, pg_dsn, sql_timeout_ms, Metric.WEEKLY_NUM_CASTS)
34 | channel_db_utils.upsert_weekly_metrics(logger, pg_dsn, sql_timeout_ms, Metric.WEEKLY_UNIQUE_CASTERS)
35 |
36 | if __name__ == "__main__":
37 |
38 | parser = argparse.ArgumentParser()
39 | parser.add_argument(
40 | "--run",
41 | action="store_true",
42 | help="dummy arg to prevent accidental execution",
43 | required=True
44 | )
45 | parser.add_argument(
46 | "--dry-run",
47 | help="indicate dry-run mode",
48 | action="store_true"
49 | )
50 | args = parser.parse_args()
51 | print(args)
52 | logger.info(settings)
53 |
54 | if args.dry_run:
55 | settings.IS_TEST = True
56 |
57 | main()
--------------------------------------------------------------------------------
/pipeline/channels/openrank_utils.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | import subprocess
3 | import os
4 | import tempfile
5 |
6 | from config import settings
7 |
8 | from loguru import logger
9 |
10 | def download_results(req_id: str, toml_file: Path, out_dir:Path, out_file: Path):
11 | new_env = os.environ.copy()
12 | new_env['SECRET_KEY'] = settings.OPENRANK_REQ_SECRET_KEY.get_secret_value()
13 | get_cmd = subprocess.run(
14 | ["openrank-sdk", "get-results", str(req_id), str(toml_file), str(out_file)],
15 | stdout=subprocess.DEVNULL,
16 | stderr=subprocess.PIPE,
17 | text=True,
18 | timeout=settings.OPENRANK_TIMEOUT_SECS,
19 | env=new_env,
20 | # check=True,  # omitted so stderr can be logged below before raising (same reasoning as update_and_compute)
21 | )
22 | if get_cmd.returncode != 0:
23 | logger.error(f"OpenRank get-results failed for {req_id}: {get_cmd.stderr}")
24 | raise Exception("OpenRank get-results failed")
25 | logger.info(f"OpenRank get-results for {req_id} downloaded to: {out_file}")
26 |
27 | def update_and_compute(lt_file: Path, pt_file: Path, toml_file: Path) -> str:
28 | new_env = os.environ.copy()
29 | new_env['SECRET_KEY'] = settings.OPENRANK_REQ_SECRET_KEY.get_secret_value()
30 |
31 | lt_cmd = subprocess.run(
32 | ["openrank-sdk", "trust-update", str(lt_file), str(toml_file)],
33 | stdout=subprocess.PIPE,
34 | stderr=subprocess.STDOUT,
35 | text=True,
36 | # check=True, # we don't want to throw error until we have a chance to print the output
37 | timeout=settings.OPENRANK_TIMEOUT_SECS,
38 | env=new_env,
39 | )
40 | logger.info(f"OpenRank trust-update output: {lt_cmd}")
41 | if lt_cmd.returncode != 0:
42 | logger.error(f"OpenRank trust-update failed: {lt_cmd.stdout}")
43 | raise Exception("OpenRank trust-update failed")
44 | pt_cmd = subprocess.run(
45 | ["openrank-sdk", "seed-update", str(pt_file), str(toml_file)],
46 | stdout=subprocess.PIPE,
47 | stderr=subprocess.STDOUT,
48 | text=True,
49 | timeout=settings.OPENRANK_TIMEOUT_SECS,
50 | env=new_env,
51 | )
52 | logger.info(f"OpenRank seed-update output: {pt_cmd}")
53 | if pt_cmd.returncode != 0:
54 | logger.error(f"OpenRank seed-update failed: {pt_cmd.stdout}")
55 | raise Exception("OpenRank seed-update failed")
56 | compute_cmd = subprocess.run(
57 | ["openrank-sdk", "compute-request", str(toml_file)],
58 | stdout=subprocess.PIPE,
59 | stderr=subprocess.STDOUT,
60 | text=True,
61 | timeout=settings.OPENRANK_TIMEOUT_SECS,
62 | env=new_env,
63 | )
64 | logger.info(f"OpenRank compute output: {compute_cmd}")
65 | if compute_cmd.returncode != 0:
66 | logger.error(f"OpenRank compute failed: {compute_cmd.stdout}")
67 | raise Exception("OpenRank compute failed")
68 | req_id = compute_cmd.stdout.strip()
69 | logger.info(f"OpenRank request id: {req_id}")
70 | return req_id
71 |
--------------------------------------------------------------------------------
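The two helpers above form a submit-then-fetch flow: `update_and_compute` pushes the local-trust and seed-trust files via `openrank-sdk` and returns a compute request id, and `download_results` retrieves the scores for that id. A hypothetical caller, with illustrative file paths:

```python
# Hypothetical usage of the helpers above; all paths are illustrative.
from pathlib import Path

from channels import openrank_utils

work_dir = Path("/tmp/openrank_demo")
lt_file = work_dir / "localtrust.csv"   # (i, j, v) trust edges
pt_file = work_dir / "pretrust.csv"     # seed trust values
toml_file = work_dir / "rank.toml"      # openrank-sdk job config
out_file = work_dir / "scores.csv"

req_id = openrank_utils.update_and_compute(lt_file, pt_file, toml_file)
openrank_utils.download_results(req_id, toml_file, work_dir, out_file)
print(f"scores written to {out_file}")
```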
/pipeline/dag_utils/clear_task_instance.py:
--------------------------------------------------------------------------------
1 | from airflow import settings
2 | from airflow.models import DagRun, TaskInstance
3 | from airflow.utils.state import State
4 |
5 | # Define your variables
6 | dag_id = "gen_personal_graph_replica_v1"
7 | task_id = "process_channel_chunk"
8 | run_id = "manual__2024-07-22T06:46:15.813325+00:00"
9 | map_index_start = 908 # 908 430
10 | map_index_end = 939 # 939 907
11 |
12 | # Get the session
13 | session = settings.Session()
14 |
15 | # Query the DagRun
16 | dag_run = session.query(DagRun).filter(DagRun.dag_id == dag_id, DagRun.run_id == run_id).one()
17 |
18 | # Loop through the range of map indexes and clear each task instance
19 | for map_index in range(map_index_start, map_index_end + 1):
20 | try:
21 | # Query the TaskInstance
22 | task_instance = session.query(TaskInstance).filter(
23 | TaskInstance.dag_id == dag_id,
24 | TaskInstance.task_id == task_id,
25 | TaskInstance.run_id == run_id,
26 | TaskInstance.map_index == map_index
27 | ).one()
28 |
29 | # Mark the task instance as SUCCESS (rather than clearing it for a re-run)
30 | task_instance.set_state(State.SUCCESS, session=session)
31 | print(f"Cleared task {task_id} with map index {map_index} for DAG {dag_id} and run ID {run_id}")
32 | except Exception as e:
33 | print(f"Could not clear task {task_id} with map index {map_index}: {e}")
34 |
35 | # Commit the changes
36 | session.commit()
37 | print(f"Cleared tasks {task_id} with map indexes from {map_index_start} to {map_index_end} for DAG {dag_id} and run ID {run_id}")
38 |
39 |
--------------------------------------------------------------------------------
/pipeline/dag_utils/combine_csv.py:
--------------------------------------------------------------------------------
1 | import os
2 | import csv
3 | import re
4 |
5 | # Specify the directory containing the CSV files
6 | directory = 'backup/'
7 |
8 | # Specify the output file
9 | output_file = 'combined_dataset.csv'
10 |
11 | # Function to extract numeric offset from filename
12 | def extract_offset(filename):
13 | match = re.search(r'offset_(\d+)', filename)
14 | return int(match.group(1)) if match else 0
15 |
16 | # Get list of files sorted by numeric offset
17 | files = sorted(
18 | (f for f in os.listdir(directory) if f.startswith('karma3-labs.dataset_k3l_cast_localtrust_offset_') and f.endswith('.csv')),
19 | key=extract_offset
20 | )
21 |
22 | # Initialize a flag to handle headers
23 | header_saved = False
24 |
25 | # Open the output file in write mode
26 | with open(output_file, 'w', newline='') as outfile:
27 | csv_writer = csv.writer(outfile)
28 |
29 | # Iterate over each sorted file
30 | for filename in files:
31 | file_path = os.path.join(directory, filename)
32 |
33 | # Open each CSV file in read mode
34 | with open(file_path, 'r') as infile:
35 | csv_reader = csv.reader(infile)
36 |
37 | # Iterate over the rows in the input file
38 | for i, row in enumerate(csv_reader):
39 | # Write the header only once
40 | if i == 0:
41 | if not header_saved:
42 | csv_writer.writerow(row)
43 | header_saved = True
44 | else:
45 | # Skip empty rows
46 | if any(cell.strip() for cell in row):
47 | csv_writer.writerow(row)
48 |
49 | print(f'Combined CSV file saved as {output_file}')
--------------------------------------------------------------------------------
/pipeline/dag_utils/dune_backup.py:
--------------------------------------------------------------------------------
1 | import os
2 | import urllib3
3 | from concurrent.futures import ThreadPoolExecutor, as_completed
4 | import time
5 | import random
6 |
7 | http = urllib3.PoolManager()
8 |
9 | def download_csv(limit: int, offset: int, table_name: str):
10 | """
11 | Download CSV data from the backend server.
12 |
13 | Args:
14 | endpoint (str): The endpoint for the download.
15 |
16 | Returns:
17 | List[dict]: List of downloaded data.
18 |
19 | Example:
20 | data = et._download_csv('localtrust/123')
21 | """
22 | print(f'limit={limit}, offset={offset}')
23 | jitter = random.uniform(0.01, 1)
24 | time.sleep(jitter)
25 |
26 | response = http.request(
27 | 'GET',
28 | f'https://api.dune.com/api/v1/query/3832819/results/csv?limit={limit}&offset={offset}',
29 | headers={
30 | 'Accept': 'text/csv',
31 | 'Content-Type': 'text/csv',
32 | 'X-DUNE-API-KEY': os.environ['DUNE_API_KEY'],  # read the key from the environment (see .env.sample); never hardcode secrets
33 | },
34 | preload_content=False
35 | )
36 | if response.status != 200:
37 | raise Exception(f"Failed to download CSV: {response.data.decode('utf-8')}")
38 |
39 | # data = response.data.decode('utf-8')
40 | # print(data)
41 | filename = f'backup/{table_name}_offset_{offset}_limit_{limit}.csv'
42 | with open(filename, 'wb') as out_file:
43 | # print(data)
44 | # data = response.read() # a `bytes` object
45 | out_file.write(response.data)
46 |
47 | # shutil.copyfileobj(response, out_file)
48 | # out_file.write(response)
49 | print(f'wrote {filename}')
50 |
51 |
52 |
53 | limit = 30000
54 | # next = limit
55 | offset = 0
56 |
57 | start = 0
58 | stop = 382500000
59 | step = limit
60 | incremental_array = list(range(start, stop + step, step))
61 |
62 | # print(incremental_array[:100])
63 | num_workers = 25
64 | table_name = "karma3-labs.dataset_k3l_cast_localtrust"
65 | # Use ThreadPoolExecutor to make parallel HTTP requests
66 | with ThreadPoolExecutor(max_workers=num_workers) as executor:
67 | future_to_value = {executor.submit(download_csv, limit, value, table_name): value for value in incremental_array}
68 |
69 | for future in as_completed(future_to_value):
70 | value = future_to_value[future]
71 | try:
72 | future.result()
73 | except Exception as exc:
74 | print(f'Value {value} generated an exception: {exc}')
75 |
76 | print("All requests completed.")
--------------------------------------------------------------------------------
/pipeline/dags/archived/dag_automod.py:
--------------------------------------------------------------------------------
1 | from airflow import DAG
2 | from airflow.operators.bash_operator import BashOperator
3 | from airflow.models import Variable
4 | from hooks.discord import send_alert_discord
5 | from hooks.pagerduty import send_alert_pagerduty
6 | from datetime import datetime, timedelta
7 |
8 |
9 | api_key = Variable.get("API_KEY", default_var="api_key")
10 | db_endpoint = Variable.get('DB_ENDPOINT', default_var="test")
11 | db_user = Variable.get('DB_USER', default_var="test")
12 | db_password = Variable.get('DB_PASSWORD', default_var="test")
13 |
14 |
15 | default_args = {
16 | 'owner': 'coder2j',
17 | 'retries': 1,
18 | 'retry_delay': timedelta(minutes=5)
19 | }
20 |
21 | with DAG(
22 | 'extract_automod_api_to_db',
23 | default_args=default_args,
24 | description='Fetch data from AUTOMOD API and load into DB daily',
25 | # schedule_interval=timedelta(days=1),
26 | schedule_interval=None,
27 | start_date=datetime(2024, 9, 4),
28 | is_paused_upon_creation=True,
29 | max_active_runs=1,
30 | catchup=False,
31 | ) as dag:
32 | fetch_data_from_automod = BashOperator(
33 | task_id='fetch_automod_data_from_api',
34 | bash_command=f"cd /pipeline/extractors ; python3 automod_extractor.py {api_key} { db_user } { db_password } { db_endpoint }"
35 | )
36 |
37 | fetch_data_from_automod
38 |
--------------------------------------------------------------------------------
/pipeline/dags/archived/dag_backup_sandbox_db.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime, timedelta
2 |
3 | from airflow import DAG
4 | from airflow.models import Variable
5 | from airflow.contrib.operators.ssh_operator import SSHOperator
6 | from airflow.contrib.hooks.ssh_hook import SSHHook
7 | from airflow.operators.bash import BashOperator
8 |
9 | from hooks.discord import send_alert_discord
10 | from hooks.pagerduty import send_alert_pagerduty
11 |
12 | default_args = {
13 | 'owner': 'coder2j',
14 | 'retries': 5,
15 | 'retry_delay': timedelta(minutes=2),
16 | 'on_failure_callback': [send_alert_discord, send_alert_pagerduty],
17 | }
18 |
19 | sandbox_db_sync_path = Variable.get("sandbox_db_sync_path")
20 | dev_sandbox_db_sync_path = Variable.get("dev_sandbox_db_sync_path")
21 |
22 | with DAG(
23 | dag_id='dag_backup_sandbox_db_v0',
24 | default_args=default_args,
25 | description='sync the db table of the sandboxed read replica',
26 | start_date=datetime(2024, 8, 10, 18),
27 | # schedule_interval='0 0 * * *', # backup everyday
28 | schedule_interval=None, # daily backup schedule currently disabled
29 | catchup=False,
30 | ) as dag:
31 |
32 |
33 | # ssh_hook = SSHHook(ssh_conn_id='eigen2', keepalive_interval=60, cmd_timeout=None)
34 |
35 | # run_sandbox_backup = SSHOperator(
36 | # task_id="run_sandbox_backup_v0",
37 | # command=f"cd {sandbox_db_sync_path}; ./run-backup.sh ",
38 | # ssh_hook=ssh_hook,
39 | # dag=dag)
40 |
41 | run_sandbox_backup = BashOperator(
42 | task_id='run_sandbox_backup',
43 | bash_command="cd /pipeline && ./run_sandbox_backup.sh "
44 | )
45 |
46 | run_sandbox_backup
47 |
--------------------------------------------------------------------------------
/pipeline/dags/archived/dag_copy_graph_files_to_sandbox_dev_v1.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime, timedelta
2 |
3 | from airflow import DAG
4 | from airflow.models import Variable
5 | from airflow.operators.bash import BashOperator
6 | from airflow.providers.ssh.operators.ssh import SSHHook
7 | from airflow.providers.ssh.operators.ssh import SSHOperator
8 | from airflow.sensors.external_task import ExternalTaskSensor
9 |
10 | from hooks.discord import send_alert_discord
11 | from hooks.pagerduty import send_alert_pagerduty
12 |
13 | default_args = {
14 | "owner": "coder2j",
15 | "retries": 5,
16 | "retry_delay": timedelta(minutes=2),
17 | "on_failure_callback": [send_alert_discord, send_alert_pagerduty],
18 | }
19 |
20 | dev_sandbox_pipeline_path = Variable.get("dev_sandbox_pipeline_path")
21 | data_backup_s3_bucket = Variable.get("data_backup_s3_bucket")
22 |
23 | with DAG(
24 | dag_id="copy_graph_files_to_sandbox_dev_v2",
25 | default_args=default_args,
26 | description="re-generate graph for farcaster-graph API server. copy re-generated all graph files to dev sandbox from backup s3",
27 | start_date=datetime(2024, 7, 9, 18),
28 | # schedule_interval="0 0 * * *",
29 | schedule_interval=None,
30 | is_paused_upon_creation=True,
31 | max_active_runs=1,
32 | catchup=False,
33 | ) as dag:
34 |
35 | ssh_hook = SSHHook(ssh_conn_id='sandbox_staging', keepalive_interval=60, cmd_timeout=None)
36 |
37 | download_pqt_file = SSHOperator(
38 | task_id="download_pqt_file_v1",
39 | command=f"cd {dev_sandbox_pipeline_path}; ./run_graph_pipeline.sh -o /data/serve_files -s {data_backup_s3_bucket} ",
40 | ssh_hook=ssh_hook,
41 | dag=dag,
42 | )
43 |
44 | download_pqt_file
--------------------------------------------------------------------------------
/pipeline/dags/archived/dag_degen_tips_processing.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime, timedelta
2 | from airflow import DAG
3 | from airflow.operators.bash import BashOperator
4 | from airflow.operators.python import PythonOperator
5 | from hooks.discord import send_alert_discord
6 | from hooks.pagerduty import send_alert_pagerduty
7 |
8 | default_args = {
9 | 'owner': 'coder2j',
10 | 'retries': 5,
11 | 'retry_delay': timedelta(minutes=2),
12 | # 'on_failure_callback': [send_alert_discord, send_alert_pagerduty],
13 | }
14 |
15 | with DAG(
16 | dag_id='dag_degen_tips_processing_v0',
17 | default_args=default_args,
18 | description='Process DEGEN tips from casts',
19 | start_date=datetime(2024, 7, 9, 18),
20 | # schedule_interval='*/10 * * * *', # Run every 10 minutes
21 | schedule_interval=None,
22 | catchup=False,
23 | ) as dag:
24 | task_update_degen_tips = BashOperator(
25 | task_id='update_degen_tips_v0',
26 | bash_command='''cd /pipeline/ && ./run_create_degen_db_functions.sh -v .venv -t extract
27 | '''
28 | )
29 |
30 | task_analyze_degen_tips = BashOperator(
31 | task_id='analyze_degen_tips_v0',
32 | bash_command='''cd /pipeline/ && ./run_eigen2_postgres_sql.sh -w . "
33 | ANALYZE k3l_degen_tips;
34 | ANALYZE k3l_cast_action;"
35 | '''
36 | )
37 |
38 | # Set up the task dependencies
39 | task_update_degen_tips >> task_analyze_degen_tips
--------------------------------------------------------------------------------
/pipeline/dags/archived/dag_gen_personal_graph_replica_v0.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime, timedelta
2 |
3 | from airflow import DAG
4 | from airflow.models import Variable
5 | from airflow.contrib.operators.ssh_operator import SSHOperator
6 | from airflow.contrib.hooks.ssh_hook import SSHHook
7 |
8 | from hooks.discord import send_alert_discord
9 | from hooks.pagerduty import send_alert_pagerduty
10 |
11 | default_args = {
12 | 'owner': 'coder2j',
13 | 'retries': 5,
14 | 'retry_delay': timedelta(minutes=2),
15 | 'on_failure_callback': [send_alert_discord, send_alert_pagerduty],
16 | }
17 |
18 | with DAG(
19 | dag_id='gen_personal_graph_replica_v0',
20 | default_args=default_args,
21 | description='Every hour, try running personal graph script on eigen7 replica. Script has internal check for 36 hours',
22 | start_date=datetime(2024, 7, 9, 18),
23 | # schedule_interval='0 * * * *',
24 | schedule_interval=None,
25 | catchup=False,
26 | ) as dag:
27 | ssh_hook = SSHHook(ssh_conn_id='eigen7', keepalive_interval=60, cmd_timeout=None)
28 |
29 | eigen7_copy_localtrust_csv_files = SSHOperator(
30 | task_id="eigen7_gen_personal_graph",
31 | command=f"cd ~/farcaster-graph/pipeline; ./run_personal_graph_pipeline.sh -i ~/serve_files/lt_l1rep6rec3m12enhancedConnections_fid.csv -o ~/wip_files/ -w . -v .venv -s k3l-openrank-farcaster -l /var/log/farcaster-graph/ ",
32 | ssh_hook=ssh_hook,
33 | dag=dag)
34 |
35 | eigen7_copy_localtrust_csv_files
36 |
--------------------------------------------------------------------------------
/pipeline/dags/archived/dag_insert_degen_ranking_v0.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime, timedelta
2 | from airflow import DAG
3 | from airflow.operators.bash import BashOperator
4 | from airflow.operators.python import PythonOperator
5 | from hooks.discord import send_alert_discord
6 | from hooks.pagerduty import send_alert_pagerduty
7 |
8 | default_args = {
9 | 'owner': 'coder2j',
10 | 'retries': 5,
11 | 'retry_delay': timedelta(minutes=2),
12 | # 'on_failure_callback': [send_alert_discord, send_alert_pagerduty],
13 | }
14 |
15 | with DAG(
16 | dag_id='dag_degen_insert_ranking_v0',
17 | default_args=default_args,
18 | description='Insert DEGEN ranking scores',
19 | start_date=datetime(2024, 7, 9, 18),
20 | # schedule_interval='10 */6 * * *',
21 | schedule_interval=None,
22 | catchup=False,
23 | ) as dag:
24 |
25 | task_update_degen_tips = BashOperator(
26 | task_id='update_degen_tips_v0',
27 | bash_command='''cd /pipeline/ && ./run_create_degen_db_functions.sh -v .venv -t insert_scores
28 | '''
29 | )
30 |
31 | task_analyze_degen_tips = BashOperator(
32 | task_id='analyze_degen_tips_v0',
33 | bash_command='''cd /pipeline/ && ./run_eigen2_postgres_sql.sh -w . "
34 | ANALYZE k3l_degen_tips;
35 | ANALYZE k3l_cast_action;"
36 | '''
37 | )
38 |
39 | # Set up the task dependencies
40 | task_update_degen_tips >> task_analyze_degen_tips
--------------------------------------------------------------------------------
/pipeline/dags/archived/dag_run_frame_pipeline_v0.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime, timedelta
2 |
3 | from airflow import DAG
4 | from airflow.operators.bash import BashOperator
5 |
6 | from hooks.discord import send_alert_discord
7 | from hooks.pagerduty import send_alert_pagerduty
8 |
9 | default_args = {
10 | 'owner': 'coder2j',
11 | 'retries': 5,
12 | 'retry_delay': timedelta(minutes=2),
13 | 'on_failure_callback': [send_alert_discord, send_alert_pagerduty],
14 | }
15 |
16 | with DAG(
17 | dag_id='extract_frame_url_v0',
18 | default_args=default_args,
19 | description='Extract urls from cast embeds for frames and refresh pg statistics',
20 | start_date=datetime(2024, 7, 9, 18),
21 | # schedule_interval='1-59/20 * * * *',
22 | # Decommission Frames ranking due to lack of usage
23 | # ... and relevance with the introduction of Frames V2 by Warpcast
24 | # schedule_interval=timedelta(minutes=20),
25 | schedule_interval=None,
26 | is_paused_upon_creation=True,
27 | max_active_runs=1,
28 | catchup=False,
29 | ) as dag:
30 | task1 = BashOperator(
31 | task_id='run_urlextract_pipeline',
32 | bash_command='cd /pipeline/ && ./run_urlextract_pipeline.sh -w . '
33 | )
34 |
35 | task2 = BashOperator(
36 | task_id='run_frame_scraper',
37 | bash_command='cd /pipeline/ && ./run_frame_scraper.sh -v ./.venv/ '
38 | )
39 |
40 | task3 = BashOperator(
41 | task_id='analyze_url_labels_and_mapping',
42 | bash_command='''cd /pipeline/ && ./run_eigen2_postgres_sql.sh -w . "
43 | ANALYZE k3l_url_labels; ANALYZE k3l_cast_embed_url_mapping;"
44 | '''
45 | )
46 |
47 | task4 = BashOperator(
48 | task_id='refresh_k3l_frame_interaction',
49 | bash_command='''cd /pipeline/ && ./run_eigen2_postgres_sql.sh -w . "
50 | REFRESH MATERIALIZED VIEW CONCURRENTLY k3l_recent_frame_interaction;"
51 | '''
52 | )
53 |
54 | # task5 = BashOperator(
55 | # task_id='vacuum_k3l_frame_interaction',
56 | # bash_command='''cd /pipeline/ && ./run_eigen2_postgres_sql.sh -w . "
57 | # VACUUM ANALYZE k3l_recent_frame_interaction;"
58 | # '''
59 | # )
60 |
61 | # task1 >> task2 >> task3 >> task4 >> task5
62 | task1 >> task2 >> task3 >> task4
63 |
64 |
--------------------------------------------------------------------------------
/pipeline/dags/archived/extractors/dag_warpcast_channel_followers.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime, timedelta
2 |
3 | from airflow import DAG
4 | from airflow.operators.bash import BashOperator
5 |
6 | from hooks.discord import send_alert_discord
7 | from hooks.pagerduty import send_alert_pagerduty
8 |
9 | default_args = {
10 | "owner": "karma3labs",
11 | "retries": 1,
12 | "retry_delay": timedelta(minutes=5),
13 | 'on_failure_callback': [send_alert_discord, send_alert_pagerduty],
14 | }
15 |
16 | with DAG(
17 | "extract_warpcast_followers",
18 | default_args=default_args,
19 | description="Fetch channel followers from WARPCAST API and load into DB daily",
20 | schedule_interval=timedelta(days=1),
21 | start_date=datetime(2024, 8, 1),
22 | is_paused_upon_creation=True,
23 | max_active_runs=1,
24 | catchup=False,
25 | ) as dag:
26 |
27 | prep_task = BashOperator(
28 | task_id='prep_warpcast_followers',
29 | bash_command="cd /pipeline; extractors/extract_channel_fids.sh -t prep"
30 | " -w . -v .venv -j followers",
31 | dag=dag
32 | )
33 |
34 | fetch_task = BashOperator(
35 | task_id='extract_channel_followers',
36 | bash_command="cd /pipeline; extractors/extract_channel_fids.sh -t fetch"
37 | " -w . -v .venv -c channels/Top_Channels.csv -s top -j followers",
38 | dag=dag
39 | )
40 |
41 | cleanup_task = BashOperator(
42 | task_id='cleanup_warpcast_followers',
43 | bash_command="cd /pipeline; extractors/extract_channel_fids.sh -t cleanup"
44 | " -w . -v .venv -j followers",
45 | dag=dag
46 | )
47 |
48 | prep_task >> fetch_task >> cleanup_task
--------------------------------------------------------------------------------
/pipeline/dags/archived/extractors/dag_warpcast_channel_members.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime, timedelta
2 |
3 | from airflow import DAG
4 | from airflow.operators.bash import BashOperator
5 |
6 | from hooks.discord import send_alert_discord
7 | from hooks.pagerduty import send_alert_pagerduty
8 |
9 | default_args = {
10 | "owner": "karma3labs",
11 | "retries": 1,
12 | "retry_delay": timedelta(minutes=5),
13 | 'on_failure_callback': [send_alert_discord, send_alert_pagerduty],
14 | }
15 |
16 | with DAG(
17 | "extract_warpcast_members",
18 | default_args=default_args,
19 | description="Fetch channel members from WARPCAST API and load into DB daily",
20 | schedule_interval=timedelta(hours=1),
21 | start_date=datetime(2024, 8, 1),
22 | is_paused_upon_creation=True,
23 | max_active_runs=1,
24 | catchup=False,
25 | ) as dag:
26 |
27 | prep_task = BashOperator(
28 | task_id='prep_warpcast_members',
29 | bash_command="cd /pipeline; extractors/extract_channel_fids.sh -t prep"
30 | " -w . -v .venv -j members",
31 | dag=dag
32 | )
33 |
34 | fetch_task = BashOperator(
35 | task_id='fetch_warpcast_members',
36 | bash_command="cd /pipeline; extractors/extract_channel_fids.sh -t fetch"
37 | " -w . -v .venv -c channels/Top_Channels.csv -s top -j members",
38 | dag=dag
39 | )
40 |
41 | cleanup_task = BashOperator(
42 | task_id='cleanup_warpcast_members',
43 | bash_command="cd /pipeline; extractors/extract_channel_fids.sh -t cleanup"
44 | " -w . -v .venv -j members",
45 | dag=dag
46 | )
47 |
48 | prep_task >> fetch_task >> cleanup_task
--------------------------------------------------------------------------------
/pipeline/dags/archived/extractors/dag_warpcast_channels.py:
--------------------------------------------------------------------------------
1 | from airflow import DAG
2 | from airflow.operators.bash_operator import BashOperator
3 | from airflow.models import Variable
4 | from hooks.discord import send_alert_discord
5 | from hooks.pagerduty import send_alert_pagerduty
6 | from datetime import datetime, timedelta
7 |
8 | db_endpoint = Variable.get('DB_ENDPOINT', default_var="test")
9 | db_user = Variable.get('DB_USER', default_var="test")
10 | db_password = Variable.get('DB_PASSWORD', default_var="test")
11 |
12 |
13 | default_args = {
14 | 'owner': 'coder2j',
15 | 'retries': 1,
16 | 'retry_delay': timedelta(minutes=5)
17 | }
18 |
19 | with DAG(
20 | 'extract_warpcast_channels',
21 | default_args=default_args,
22 | description='Fetch channels metadata from WARPCAST API and load into DB daily',
23 | schedule_interval=timedelta(days=1),
24 | start_date=datetime(2024, 8, 19),
25 | is_paused_upon_creation=True,
26 | max_active_runs=1,
27 | catchup=False,
28 | ) as dag:
29 | fetch_data_from_warpcast = BashOperator(
30 | task_id='fetch_warpcast_data_from_api',
31 | bash_command="cd /pipeline; extractors/extract_channel_data.sh"
32 | " -w . -v .venv ",
33 | dag=dag
34 | )
35 |
36 | fetch_data_from_warpcast
37 |
--------------------------------------------------------------------------------
/pipeline/dags/archived/sandbox/dag_sync_sandbox_casts.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime, timedelta
2 |
3 | from airflow import DAG
4 | from airflow.models import Variable
5 | from airflow.contrib.operators.ssh_operator import SSHOperator
6 | from airflow.contrib.hooks.ssh_hook import SSHHook
7 |
8 | from hooks.discord import send_alert_discord
9 | from hooks.pagerduty import send_alert_pagerduty
10 |
11 | default_args = {
12 | 'owner': 'coder2j',
13 | 'retries': 5,
14 | 'retry_delay': timedelta(minutes=2),
15 | 'on_failure_callback': [send_alert_discord, send_alert_pagerduty],
16 | }
17 |
18 | sandbox_db_sync_path = Variable.get("sandbox_db_sync_path")
19 | dev_sandbox_db_sync_path = Variable.get("dev_sandbox_db_sync_path")
20 |
21 | with DAG(
22 | dag_id='sync_sandbox_db_casts',
23 | default_args=default_args,
24 | description='sync cast actions and parent casts to the sandbox',
25 | start_date=datetime(2024, 7, 10, 18),
26 | # schedule_interval='*/10 * * * *',
27 | # schedule_interval=timedelta(minutes=5),
28 | schedule=None,
29 | is_paused_upon_creation=True,
30 | max_active_runs=1,
31 | catchup=False,
32 | ) as dag:
33 | ssh_hook = SSHHook(ssh_conn_id='eigen2', keepalive_interval=60, cmd_timeout=None)
34 |
35 | run_append = SSHOperator(
36 | task_id="run_append_v1",
37 | command=f"cd {sandbox_db_sync_path}; ./1-run-append_v1.sh ",
38 | ssh_hook=ssh_hook,
39 | dag=dag)
40 |
41 | run_remove = SSHOperator(
42 | task_id="run_remove_v0",
43 | command=f"cd {sandbox_db_sync_path}; ./2-run-remove.sh ",
44 | ssh_hook=ssh_hook,
45 | dag=dag)
46 |
47 | run_append >> run_remove
48 |
49 |
--------------------------------------------------------------------------------
/pipeline/dags/archived/sandbox/dag_sync_sandbox_channel_fids.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime, timedelta
2 |
3 | from airflow import DAG
4 | from airflow.models import Variable
5 | from airflow.contrib.operators.ssh_operator import SSHOperator
6 | from airflow.contrib.hooks.ssh_hook import SSHHook
7 |
8 | from hooks.discord import send_alert_discord
9 | from hooks.pagerduty import send_alert_pagerduty
10 |
11 | default_args = {
12 | 'owner': 'coder2j',
13 | 'retries': 5,
14 | 'retry_delay': timedelta(minutes=2),
15 | 'on_failure_callback': [send_alert_discord, send_alert_pagerduty],
16 | }
17 |
18 | sandbox_db_sync_path = Variable.get("sandbox_db_sync_path")
19 |
20 | with DAG(
21 | dag_id='sync_sandbox_channel_fids',
22 | default_args=default_args,
23 | description='sync channel fids to the sandbox',
24 | start_date=datetime(2024, 7, 10, 18),
25 | # schedule_interval='*/10 * * * *',
26 | schedule=None,
27 | is_paused_upon_creation=True,
28 | max_active_runs=1,
29 | catchup=False,
30 | ) as dag:
31 | ssh_hook = SSHHook(ssh_conn_id='eigen2', keepalive_interval=60, cmd_timeout=None)
32 |
33 | run_append = SSHOperator(
34 | task_id="run_append_v1",
35 | command=f"cd {sandbox_db_sync_path}; ./1-run-append_v1.sh -c ",
36 | ssh_hook=ssh_hook,
37 | dag=dag)
38 |
39 | run_refresh = SSHOperator(
40 | task_id="run_refresh_v0",
41 | command=f"cd {sandbox_db_sync_path}; ./4-run-refresh.sh -c ",
42 | ssh_hook=ssh_hook,
43 | dag=dag)
44 |
45 | run_append >> run_refresh
46 |
47 |
--------------------------------------------------------------------------------
/pipeline/dags/archived/sandbox/dag_sync_sandbox_db_dev.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime, timedelta
2 |
3 | from airflow import DAG
4 | from airflow.models import Variable
5 | from airflow.contrib.operators.ssh_operator import SSHOperator
6 | from airflow.contrib.hooks.ssh_hook import SSHHook
7 |
8 | from hooks.discord import send_alert_discord
9 | from hooks.pagerduty import send_alert_pagerduty
10 |
11 | default_args = {
12 | 'owner': 'coder2j',
13 | 'retries': 5,
14 | 'retry_delay': timedelta(minutes=2),
15 | 'on_failure_callback': [send_alert_discord, send_alert_pagerduty],
16 | }
17 |
18 | dev_sandbox_db_sync_path = Variable.get("dev_sandbox_db_sync_path")
19 |
20 | with DAG(
21 | dag_id='dag_sync_sandbox_db_dev_v0',
22 | default_args=default_args,
23 | description='sync the db table of the sandboxed read replica',
24 | start_date=datetime(2024, 7, 10, 18),
25 | # schedule_interval='*/10 * * * *',
26 | schedule_interval=None,
27 | catchup=False,
28 | ) as dag:
29 | ssh_hook = SSHHook(ssh_conn_id='eigen2', keepalive_interval=60, cmd_timeout=None)
30 |
31 | run_append_dev = SSHOperator(
32 | task_id="run_append_dev_v0",
33 | command=f"cd {dev_sandbox_db_sync_path}; ./1-run-append.sh -d 5 ",
34 | ssh_hook=ssh_hook,
35 | dag=dag)
36 |
37 | run_remove_dev = SSHOperator(
38 | task_id="run_remove_dev_v0",
39 | command=f"cd {dev_sandbox_db_sync_path}; ./2-run-remove.sh ",
40 | ssh_hook=ssh_hook,
41 | dag=dag)
42 |
43 | run_append_dev >> run_remove_dev
44 |
45 |
--------------------------------------------------------------------------------
/pipeline/dags/archived/sandbox/dag_sync_sandbox_globaltrust.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime, timedelta
2 |
3 | from airflow import DAG
4 | from airflow.models import Variable
5 | from airflow.contrib.operators.ssh_operator import SSHOperator
6 | from airflow.contrib.hooks.ssh_hook import SSHHook
7 |
8 | from hooks.discord import send_alert_discord
9 | from hooks.pagerduty import send_alert_pagerduty
10 |
11 | default_args = {
12 | 'owner': 'coder2j',
13 | 'retries': 5,
14 | 'retry_delay': timedelta(minutes=2),
15 | 'on_failure_callback': [send_alert_discord, send_alert_pagerduty],
16 | }
17 |
18 | sandbox_db_sync_path = Variable.get("sandbox_db_sync_path")
19 |
20 | with DAG(
21 | dag_id='sync_sandbox_globaltrust',
22 | default_args=default_args,
23 | description='sync globaltrust to the sandbox',
24 | start_date=datetime(2024, 7, 10, 18),
25 | # schedule_interval='*/10 * * * *',
26 | schedule=None,
27 | is_paused_upon_creation=True,
28 | max_active_runs=1,
29 | catchup=False,
30 | ) as dag:
31 | ssh_hook = SSHHook(ssh_conn_id='eigen2', keepalive_interval=60, cmd_timeout=None)
32 |
33 | run_append = SSHOperator(
34 | task_id="run_append_v1",
35 | command=f"cd {sandbox_db_sync_path}; ./1-run-append_v1.sh -g ",
36 | ssh_hook=ssh_hook,
37 | dag=dag)
38 |
39 | run_refresh = SSHOperator(
40 | task_id="run_refresh_v0",
41 | command=f"cd {sandbox_db_sync_path}; ./4-run-refresh.sh -g ",
42 | ssh_hook=ssh_hook,
43 | dag=dag)
44 |
45 | run_append >> run_refresh
46 |
47 |
--------------------------------------------------------------------------------
/pipeline/dags/archived/sandbox/dag_sync_sandbox_labels.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime, timedelta
2 |
3 | from airflow import DAG
4 | from airflow.models import Variable
5 | from airflow.contrib.operators.ssh_operator import SSHOperator
6 | from airflow.contrib.hooks.ssh_hook import SSHHook
7 |
8 | from hooks.discord import send_alert_discord
9 | from hooks.pagerduty import send_alert_pagerduty
10 |
11 | default_args = {
12 | 'owner': 'coder2j',
13 | 'retries': 5,
14 | 'retry_delay': timedelta(minutes=2),
15 | 'on_failure_callback': [send_alert_discord, send_alert_pagerduty],
16 | }
17 |
18 | sandbox_db_sync_path = Variable.get("sandbox_db_sync_path")
19 | dev_sandbox_db_sync_path = Variable.get("dev_sandbox_db_sync_path")
20 |
21 | with DAG(
22 | dag_id='sync_sandbox_db_labels',
23 | default_args=default_args,
24 | description='sync labels to the sandbox',
25 | start_date=datetime(2024, 7, 10, 18),
26 | # schedule_interval='*/10 * * * *',
27 | schedule=None,
28 | is_paused_upon_creation=True,
29 | max_active_runs=1,
30 | catchup=False,
31 | ) as dag:
32 | ssh_hook = SSHHook(ssh_conn_id='eigen2', keepalive_interval=60, cmd_timeout=None)
33 |
34 | run_append = SSHOperator(
35 | task_id="run_append_v1",
36 | command=f"cd {sandbox_db_sync_path}; ./1-run-append_v1.sh -l",
37 | ssh_hook=ssh_hook,
38 | dag=dag)
39 |
40 | run_append
41 |
42 |
--------------------------------------------------------------------------------
/pipeline/dags/cura/dag_direct_cast_join_requests.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime, timedelta
2 |
3 | from airflow import DAG
4 | from airflow.models import Variable
5 | from airflow.operators.bash import BashOperator
6 | from airflow.providers.ssh.operators.ssh import SSHHook
7 | from airflow.providers.ssh.operators.ssh import SSHOperator
8 | from airflow.decorators import task_group
9 |
10 | from hooks.discord import send_alert_discord
11 | from hooks.pagerduty import send_alert_pagerduty
12 |
13 | default_args = {
14 | "owner": "coder2j",
15 | "retries": 5,
16 | "retry_delay": timedelta(minutes=2),
17 | "on_failure_callback": [send_alert_discord, send_alert_pagerduty],
18 | }
19 |
20 | HOST_REPO_URL='cura-bot-2'
21 |
22 | with DAG(
23 | dag_id="cura_direct_cast_join_requests",
24 | default_args=default_args,
25 | description="Direct cast join requests from curabot",
26 | start_date=datetime(2024, 11, 7),
27 | schedule_interval='0 * * * *',
28 | is_paused_upon_creation=True,
29 | max_active_runs=1,
30 | catchup=False,
31 | ) as dag:
32 |
33 | ssh_hook = SSHHook(ssh_conn_id='eigen1', keepalive_interval=60, cmd_timeout=None)
34 |
35 | eigen1_install_dependencies = SSHOperator(
36 | task_id="cura_eigen1_install_deps",
37 | command=f"cd {HOST_REPO_URL} && git reset --hard HEAD && git pull origin main && pnpm i",
38 | ssh_hook=ssh_hook,
39 | dag=dag,
40 | )
41 |
42 | eigen1_direct_cast_join_requests = SSHOperator(
43 | task_id="cura_eigen1_direct_cast_join_requests",
44 | command=f"cd {HOST_REPO_URL} && npm run script:direct_cast_join_requests",
45 | ssh_hook=ssh_hook,
46 | dag=dag,
47 | )
48 |
49 | eigen1_install_dependencies >> eigen1_direct_cast_join_requests
50 |
51 |
--------------------------------------------------------------------------------
/pipeline/dags/cura/dag_run_autoinvite_rules.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime, timedelta
2 |
3 | from airflow import DAG
4 | from airflow.models import Variable
5 | from airflow.operators.bash import BashOperator
6 | from airflow.providers.ssh.operators.ssh import SSHHook
7 | from airflow.providers.ssh.operators.ssh import SSHOperator
8 | from airflow.decorators import task_group
9 |
10 | from hooks.discord import send_alert_discord
11 | from hooks.pagerduty import send_alert_pagerduty
12 |
13 | default_args = {
14 | "owner": "coder2j",
15 | "retries": 5,
16 | "retry_delay": timedelta(minutes=2),
17 | "on_failure_callback": [send_alert_discord, send_alert_pagerduty],
18 | }
19 |
20 | HOST_REPO_URL='cura-bot-3'
21 |
22 | with DAG(
23 | dag_id="cura_run_autoinvite_rules",
24 | default_args=default_args,
25 | description="Run all the autoinvite rules",
26 | start_date=datetime(2024, 11, 7),
27 | schedule_interval='0 */4 * * *',
28 | is_paused_upon_creation=True,
29 | max_active_runs=1,
30 | catchup=False,
31 | ) as dag:
32 |
33 | ssh_hook = SSHHook(ssh_conn_id='eigen1', keepalive_interval=60, cmd_timeout=None)
34 |
35 | eigen1_install_dependencies = SSHOperator(
36 | task_id="cura_eigen1_install_deps",
37 | command=f"cd {HOST_REPO_URL} && git reset --hard HEAD && git pull origin main && pnpm i",
38 | ssh_hook=ssh_hook,
39 | dag=dag,
40 | )
41 |
42 | eigen1_run_autoinvite = SSHOperator(
43 | task_id="cura_eigen1_run_autoinvite",
44 | command=f"cd {HOST_REPO_URL} && npm run script:autoinvite",
45 | ssh_hook=ssh_hook,
46 | dag=dag,
47 | )
48 |
49 | eigen1_install_dependencies >> eigen1_run_autoinvite
50 |
--------------------------------------------------------------------------------
/pipeline/dags/cura/dag_run_quote_casts.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime, timedelta
2 |
3 | from airflow import DAG
4 | from airflow.models import Variable
5 | from airflow.operators.bash import BashOperator
6 | from airflow.providers.ssh.operators.ssh import SSHHook
7 | from airflow.providers.ssh.operators.ssh import SSHOperator
8 | from airflow.decorators import task_group
9 |
10 | from hooks.discord import send_alert_discord
11 | from hooks.pagerduty import send_alert_pagerduty
12 |
13 | default_args = {
14 | "owner": "coder2j",
15 | "retries": 5,
16 | "retry_delay": timedelta(minutes=2),
17 | "on_failure_callback": [send_alert_discord, send_alert_pagerduty],
18 | }
19 |
20 | HOST_REPO_URL='cura-bot-1'
21 |
22 | with DAG(
23 | dag_id="cura_run_quote_casts",
24 | default_args=default_args,
25 | description="Quote a cast and post it from curabot",
26 | start_date=datetime(2024, 11, 7),
27 | schedule_interval='0 0 * * 5',
28 | is_paused_upon_creation=True,
29 | max_active_runs=1,
30 | catchup=False,
31 | ) as dag:
32 |
33 | ssh_hook = SSHHook(ssh_conn_id='eigen1', keepalive_interval=60, cmd_timeout=None)
34 |
35 | eigen1_install_dependencies = SSHOperator(
36 | task_id="cura_eigen1_install_deps",
37 | command=f"cd {HOST_REPO_URL} && git reset --hard HEAD && git pull origin main && pnpm i",
38 | ssh_hook=ssh_hook,
39 | dag=dag,
40 | )
41 |
42 | eigen1_run_quote_casts = SSHOperator(
43 | task_id="cura_eigen1_run_quote_casts",
44 | command=f"cd {HOST_REPO_URL} && npm run script:quote_casts",
45 | ssh_hook=ssh_hook,
46 | dag=dag,
47 | )
48 |
49 | eigen1_install_dependencies >> eigen1_run_quote_casts
50 |
51 |
--------------------------------------------------------------------------------
/pipeline/dags/dag_backup_to_s3_v1.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime, timedelta
2 |
3 | from airflow import DAG
4 | from airflow.operators.bash import BashOperator
5 | from airflow.sensors.external_task import ExternalTaskSensor
6 |
7 | from hooks.discord import send_alert_discord
8 | from hooks.pagerduty import send_alert_pagerduty
9 |
10 | default_args = {
11 | 'owner': 'coder2j',
12 | 'retries': 5,
13 | 'retry_delay': timedelta(minutes=2),
14 | 'on_failure_callback': [send_alert_discord, send_alert_pagerduty],
15 | }
16 |
17 |
18 | with DAG(
19 | dag_id='backup_to_s3_v1',
20 | default_args=default_args,
21 | description='This backs up globaltrust, localtrust and channel_ranking into s3',
22 | start_date=datetime(2024, 8, 15),
23 | schedule_interval='30 20 * * *',
24 | catchup=False,
25 | ) as dag:
26 |
27 | task1 = BashOperator(
28 | task_id='backup_globaltrust',
29 | bash_command="cd /pipeline/dags/pg_to_dune && ./upload_to_dune.sh globaltrust"
30 | )
31 |
32 | task2 = BashOperator(
33 | task_id='backup_globaltrust_config',
34 | bash_command="cd /pipeline/dags/pg_to_dune && ./upload_to_dune.sh globaltrust_config"
35 | )
36 |
37 | task3 = BashOperator(
38 | task_id='backup_localtrust',
39 | bash_command="cd /pipeline/dags/pg_to_dune && ./upload_to_dune.sh localtrust_v1 /pipeline/tmp/graph_files"
40 | )
41 |
42 | [task1, task2, task3]
43 |
44 |
--------------------------------------------------------------------------------
/pipeline/dags/dag_notify_channel_daily_trending.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime, timedelta, timezone
2 | import pytz
3 |
4 | from airflow import DAG
5 | from airflow.operators.bash import BashOperator
6 | from airflow.operators.empty import EmptyOperator
7 | from airflow.decorators import task
8 |
9 | from hooks.discord import send_alert_discord
10 | from hooks.pagerduty import send_alert_pagerduty
11 |
12 | default_args = {
13 | 'owner': 'karma3labs',
14 | 'retries': 5,
15 | 'retry_delay': timedelta(minutes=2),
16 | 'on_failure_callback': [send_alert_discord, send_alert_pagerduty],
17 | }
18 |
19 | def _9ampacific_in_utc_time():
20 | pacific_tz = pytz.timezone('US/Pacific')
21 | pacific_9am_str = ' '.join([datetime.now(pacific_tz).strftime("%Y-%m-%d"),'09:00:00'])
22 | pacific_time = pacific_tz.localize(datetime.strptime(pacific_9am_str, '%Y-%m-%d %H:%M:%S'))
23 | utc_time = pacific_time.astimezone(pytz.utc)
24 | return utc_time
25 |
26 | with DAG(
27 | dag_id='notify_channel_daily_trending',
28 | default_args=default_args,
29 | description='daily notifications for trending channels',
30 | start_date=datetime(2024, 7, 10, 18),
31 |     schedule_interval='30 16 * * *', # every day at 16:30 UTC (09:30 Pacific during PDT, 08:30 during PST)
32 | is_paused_upon_creation=True,
33 | max_active_runs=1,
34 | catchup=False,
35 | ) as dag:
36 |
37 | skip_notify = EmptyOperator(task_id="skip_notify")
38 |
39 | notify = BashOperator(
40 | task_id="notify",
41 | bash_command=(
42 | "cd /pipeline && ./run_notify_channel_daily_trending.sh "
43 | " -w . -v .venv -c channels/Trending_Channels.csv "),
44 | dag=dag)
45 |
46 | @task.branch(task_id="check_last_successful")
47 |     def check_last_successful(**context) -> str:
48 | now = datetime.now(pytz.utc)
49 | prev_run_date = context['prev_data_interval_end_success']
50 | daily_run = _9ampacific_in_utc_time()
51 | print(f"now: {now}, prev_run_date: {prev_run_date}, daily_run: {daily_run}")
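  |         # Proceed to "notify" only if 9am Pacific has already passed and there has been
  |         # no successful run since that cutoff; otherwise skip this schedule tick.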
52 | if (
53 | now > daily_run
54 | and (prev_run_date is None or prev_run_date < daily_run)
55 | ):
56 | # Last successful run was before today, so we should run
57 | print(f"Last run {prev_run_date} was before {daily_run}, so we should run")
58 | return "notify"
59 | return "skip_notify"
60 |
61 | check_last_successful = check_last_successful()
62 |
63 | check_last_successful >> skip_notify
64 |
65 | check_last_successful >> notify
66 |
67 |
--------------------------------------------------------------------------------
/pipeline/dags/dag_notify_channel_leaderboard.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime, timedelta, timezone
2 | import pytz
3 |
4 | from airflow import DAG
5 | from airflow.operators.bash import BashOperator
6 | from airflow.operators.empty import EmptyOperator
7 | from airflow.decorators import task
8 |
9 | from hooks.discord import send_alert_discord
10 | from hooks.pagerduty import send_alert_pagerduty
11 |
12 | default_args = {
13 | 'owner': 'karma3labs',
14 | 'retries': 5,
15 | 'retry_delay': timedelta(minutes=2),
16 | 'on_failure_callback': [send_alert_discord, send_alert_pagerduty],
17 | }
18 |
19 | def _monday_9ampacific_in_utc_time():
20 | pacific_tz = pytz.timezone('US/Pacific')
21 | pacific_9am_str = ' '.join([datetime.now(pacific_tz).strftime("%Y-%m-%d"),'09:00:00'])
22 | pacific_time = pacific_tz.localize(datetime.strptime(pacific_9am_str, '%Y-%m-%d %H:%M:%S'))
23 | utc_time = pacific_time.astimezone(pytz.utc)
24 | monday_utc_time = utc_time - timedelta(days=utc_time.weekday() - 0)
25 | return monday_utc_time
26 |
27 | with DAG(
28 | dag_id='notify_channel_leaderboard',
29 | default_args=default_args,
30 | description='channel notifications started by trigger dag or manually',
31 | start_date=datetime(2024, 7, 10, 18),
32 | schedule_interval=None,
33 | is_paused_upon_creation=True,
34 | max_active_runs=1,
35 | catchup=False,
36 | ) as dag:
37 |
38 | skip_notify = EmptyOperator(task_id="skip_notify")
39 |
40 | notify = BashOperator(
41 | task_id="notify",
42 | bash_command="cd /pipeline && ./run_notify_channel_leaderboard.sh -w . -v .venv -r ",
43 | dag=dag)
44 |
45 | @task.branch(task_id="check_last_successful")
46 |     def check_last_successful(**context) -> str:
47 | now = datetime.now(pytz.utc)
48 | prev_run_date = context['prev_data_interval_start_success']
49 | weekly_run = _monday_9ampacific_in_utc_time()
50 | print(f"now: {now}, prev_run_date: {prev_run_date}, weekly_run: {weekly_run}")
51 | if (
52 | now > weekly_run
53 | and (prev_run_date is None or prev_run_date < weekly_run)
54 | ):
55 | # Last successful run was before 9am on Monday, so we should run
56 | print(f"Last run {prev_run_date} was before {weekly_run}, so we should run")
57 | return "notify"
58 | return "skip_notify"
59 |
60 | check_last_successful = check_last_successful()
61 |
62 | check_last_successful >> skip_notify
63 |
64 | check_last_successful >> notify
65 |
66 |
--------------------------------------------------------------------------------
/pipeline/dags/dag_notify_channel_weekly_mods.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime, timedelta
2 | import pytz
3 |
4 | from airflow import DAG
5 | from airflow.operators.bash import BashOperator
6 | from airflow.operators.empty import EmptyOperator
7 | from airflow.decorators import task
8 |
9 | from hooks.discord import send_alert_discord
10 | from hooks.pagerduty import send_alert_pagerduty
11 |
12 | default_args = {
13 | 'owner': 'karma3labs',
14 | 'retries': 5,
15 | 'retry_delay': timedelta(minutes=2),
16 | 'on_failure_callback': [send_alert_discord, send_alert_pagerduty],
17 | }
18 |
19 | def wed_9ampacific_in_utc_time():
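  |     # Returns 9:00am US/Pacific on this week's Wednesday, expressed in UTC
  |     # (datetime.weekday(): Monday=0 ... Wednesday=2).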
20 | wednesday_dow = 2
21 | pacific_tz = pytz.timezone('US/Pacific')
22 | pacific_9am_str = ' '.join([datetime.now(pacific_tz).strftime("%Y-%m-%d"),'09:00:00'])
23 | pacific_time = pacific_tz.localize(datetime.strptime(pacific_9am_str, '%Y-%m-%d %H:%M:%S'))
24 | utc_time = pacific_time.astimezone(pytz.utc)
25 | return utc_time - timedelta(days=utc_time.weekday() - wednesday_dow)
26 |
27 | with DAG(
28 | dag_id='notify_channel_weekly_mods',
29 | default_args=default_args,
30 | description='weekly notifications to mods',
31 | start_date=datetime(2024, 7, 10, 18),
32 |     schedule_interval='30 16 * * 3', # every Wednesday at 16:30 UTC (09:30 Pacific during PDT, 08:30 during PST)
33 | is_paused_upon_creation=True,
34 | max_active_runs=1,
35 | catchup=False,
36 | ) as dag:
37 |
38 | skip_notify = EmptyOperator(task_id="skip_notify")
39 |
40 | notify = BashOperator(
41 | task_id="notify",
42 | bash_command=(
43 | "cd /pipeline && ./run_notify_channel_weekly_mods.sh "
44 | " -w . -v .venv -b channels/Bot_Fids.csv -s '{{ prev_data_interval_end_success }}'"),
45 | dag=dag)
46 |
47 | @task.branch(task_id="check_last_successful")
48 |     def check_last_successful(**context) -> str:
49 | now = datetime.now(pytz.utc)
50 | prev_run_date = context['prev_data_interval_end_success']
51 | weekly_run = wed_9ampacific_in_utc_time()
52 | print(f"now: {now}, prev_run_date: {prev_run_date}, weekly_run: {weekly_run}")
53 | if (
54 | now > weekly_run
55 | and (prev_run_date is None or prev_run_date < weekly_run)
56 | ):
57 |             # Last successful run was before this week's Wednesday 9am Pacific cutoff, so we should run
58 | print(f"Last run {prev_run_date} was before {weekly_run}, so we should run")
59 | return "notify"
60 | return "skip_notify"
61 |
62 | check_last_successful = check_last_successful()
63 |
64 | check_last_successful >> skip_notify
65 |
66 | check_last_successful >> notify
67 |
68 |
--------------------------------------------------------------------------------
/pipeline/dags/dag_refresh_rank_view_v0.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime, timedelta
2 |
3 | from airflow import DAG
4 | from airflow.operators.bash import BashOperator
5 | from airflow.sensors.external_task import ExternalTaskSensor
6 |
7 | from hooks.discord import send_alert_discord
8 | from hooks.pagerduty import send_alert_pagerduty
9 |
10 | default_args = {
11 | 'owner': 'coder2j',
12 | 'retries': 5,
13 | 'retry_delay': timedelta(minutes=2),
14 | 'on_failure_callback': [send_alert_discord, send_alert_pagerduty],
15 | }
16 |
17 | with DAG(
18 | dag_id='refresh_rank_view_v0',
19 | default_args=default_args,
20 | description='This refreshes k3l_rank materialized view and vacuums k3l_rank table',
21 | start_date=datetime(2024, 7, 9, 18),
22 | # schedule_interval='0 1-23/6 * * *',
23 | schedule=None,
24 | catchup=False,
25 | ) as dag:
26 |
27 | task1 = BashOperator(
28 | task_id='refresh_view_k3l_rank_e8',
29 | bash_command='''cd /pipeline/ && ./run_eigen8_postgres_sql.sh -w . "
30 | REFRESH MATERIALIZED VIEW CONCURRENTLY k3l_rank; "
31 | '''
32 | )
33 |
34 | task2 = BashOperator(
35 | task_id='vacuum_k3l_rank_e8',
36 | bash_command='''cd /pipeline/ && ./run_eigen8_postgres_sql.sh -w . "
37 | VACUUM ANALYZE k3l_rank; "
38 | '''
39 | )
40 |
41 | task1 >> task2
42 |
--------------------------------------------------------------------------------
/pipeline/dags/dag_run_cast_pipeline_v0.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime, timedelta, timezone
2 |
3 | from airflow import DAG
4 | from airflow.operators.bash import BashOperator
5 | from airflow.decorators import task
6 |
7 | from hooks.discord import send_alert_discord
8 | from hooks.pagerduty import send_alert_pagerduty
9 |
10 | default_args = {
11 | 'owner': 'coder2j',
12 | 'retries': 5,
13 | 'retry_delay': timedelta(minutes=2),
14 | 'on_failure_callback': [send_alert_discord, send_alert_pagerduty],
15 | }
16 |
17 | with DAG(
18 | dag_id='run_cast_pipeline_v0',
19 | default_args=default_args,
20 | description='extract cast interactions and refresh pg statistics',
21 | start_date=datetime(2024, 7, 9, 18),
22 | # schedule_interval='*/10 * * * *',
23 | schedule_interval=timedelta(minutes=5),
24 | max_active_runs=1,
25 | is_paused_upon_creation=True,
26 | catchup=False,
27 | ) as dag:
28 |
29 | insert = BashOperator(
30 | task_id='insert_cast_actions',
31 | bash_command='cd /pipeline/ && ./run_cast_pipeline.sh -v ./.venv/ '
32 | )
33 |
34 | insert8 = BashOperator(
35 | task_id='insert_cast_actions_e8',
36 | bash_command='cd /pipeline/ && ./run_cast_pipeline.sh -v ./.venv/ -p eigen8 '
37 | )
38 |
39 | refresh = BashOperator(
40 | task_id='refresh_parent_casts_view',
41 | bash_command='''cd /pipeline/ && ./run_eigen2_postgres_sql.sh -w . "
42 | REFRESH MATERIALIZED VIEW CONCURRENTLY k3l_recent_parent_casts;"
43 | '''
44 | )
45 |
46 | refresh8 = BashOperator(
47 | task_id='refresh_parent_casts_view_e8',
48 | bash_command='''cd /pipeline/ && ./run_eigen8_postgres_sql.sh -w . "
49 | REFRESH MATERIALIZED VIEW CONCURRENTLY k3l_recent_parent_casts;"
50 | '''
51 | )
52 |
53 | @task.bash
54 | def gapfill_task(db: str) -> str:
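  |         # Gapfill mode: re-run the cast pipeline from ~25 hours ago to now, presumably to
  |         # pick up any cast actions missed by the frequent incremental runs.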
55 | yesterday = datetime.now(timezone.utc) - timedelta(hours=25)
56 | return f"cd /pipeline/ && ./run_cast_pipeline.sh -v ./.venv/"\
57 | f" -f gapfill -p {db} -t '{yesterday.strftime('%Y-%m-%d %H:%M:%S')}'"
58 |
59 | gapfill = gapfill_task.override(task_id='gapfill_cast_actions')('eigen2')
60 | gapfill8 = gapfill_task.override(task_id='gapfill_cast_actions_e8')('eigen8')
61 |
62 | insert >> refresh >> gapfill
63 | insert8 >> refresh8 >> gapfill8
64 |
--------------------------------------------------------------------------------
/pipeline/dags/dag_update_channel_points.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime, timedelta
2 |
3 | from airflow import DAG
4 | from airflow.operators.bash import BashOperator
5 |
6 | from hooks.discord import send_alert_discord
7 | from hooks.pagerduty import send_alert_pagerduty
8 |
9 | default_args = {
10 | 'owner': 'karma3labs',
11 | 'retries': 5,
12 | 'retry_delay': timedelta(minutes=2),
13 | 'on_failure_callback': [send_alert_discord, send_alert_pagerduty],
14 | }
15 |
16 | with DAG(
17 | dag_id='update_channel_points_v2',
18 | default_args=default_args,
19 | description='update channel points triggered by update_channel_tokens dag',
20 | start_date=datetime(2024, 7, 10, 18),
21 |     schedule_interval='0 16 * * *', # every day at 16:00 UTC (09:00 Pacific during PDT)
22 | # schedule_interval=timedelta(days=1),
23 | # schedule=None,
24 | is_paused_upon_creation=True,
25 | max_active_runs=1,
26 | catchup=False,
27 | ) as dag:
28 |
29 | # run_genesis = BashOperator(
30 | # task_id="run_genesis",
31 | # bash_command="cd /pipeline && ./run_update_channel_points.sh -w . -v .venv -t genesis",
32 | # dag=dag)
33 |
34 | # daily_calc = BashOperator(
35 | # task_id="daily_calc",
36 | # bash_command="cd /pipeline && ./run_update_channel_points.sh -w . -v .venv -t compute",
37 | # dag=dag)
38 |
39 | # balance_update = BashOperator(
40 | # task_id="balance_update",
41 | # bash_command="cd /pipeline && ./run_update_channel_points.sh -w . -v .venv -t update",
42 | # dag=dag)
43 |
44 | # run_genesis8 = BashOperator(
45 | # task_id="run_genesis8",
46 | # bash_command="cd /pipeline && ./run_update_channel_points.sh -w . -v .venv -t genesis -p eigen8",
47 | # dag=dag)
48 |
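  |     # Daily flow on eigen8: compute the day's channel points, apply them to balances,
  |     # then back up the points balance table via the pg_to_dune upload script.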
49 | daily_calc8 = BashOperator(
50 | task_id="daily_calc8",
51 | bash_command="cd /pipeline && ./run_update_channel_points.sh -w . -v .venv -t compute -p eigen8",
52 | dag=dag)
53 |
54 | balance_update8 = BashOperator(
55 | task_id="balance_update8",
56 | bash_command="cd /pipeline && ./run_update_channel_points.sh -w . -v .venv -t update -p eigen8",
57 | dag=dag)
58 |
59 | backup_to_s3 = BashOperator(
60 | task_id='backup_channel_points_bal',
61 | bash_command="cd /pipeline/dags/pg_to_dune && ./upload_to_dune.sh backup_channel_points_bal "
62 | )
63 |
64 | # run_genesis >> daily_calc >> balance_update >> backup_to_s3
65 | # run_genesis8 >> daily_calc8 >> balance_update8
66 | daily_calc8 >> balance_update8 >> backup_to_s3
67 |
--------------------------------------------------------------------------------
/pipeline/dags/extractors/dag_cura_mod.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime, timedelta
2 |
3 | from airflow import DAG
4 | from airflow.operators.bash import BashOperator
5 |
6 | from hooks.discord import send_alert_discord
7 | from hooks.pagerduty import send_alert_pagerduty
8 |
9 | default_args = {
10 | "owner": "karma3labs",
11 | "retries": 1,
12 | "retry_delay": timedelta(minutes=5),
13 | 'on_failure_callback': [send_alert_discord, send_alert_pagerduty],
14 | }
15 |
16 | with DAG(
17 | "extract_cura_mod",
18 | default_args=default_args,
19 | description="Fetch hidden fids from CURA API and load into DB daily",
20 | schedule_interval=timedelta(minutes=5),
21 | # schedule_interval=None,
22 | start_date=datetime(2024, 8, 1),
23 | is_paused_upon_creation=True,
24 | max_active_runs=1,
25 | catchup=False,
26 | ) as dag:
27 |
28 | fetch_task = BashOperator(
29 | task_id='extract_cura_hidden_fids',
30 | bash_command="cd /pipeline; extractors/extract_cura_mod.sh -w . -v .venv -r ",
31 | dag=dag
32 | )
33 |
34 | fetch_task
--------------------------------------------------------------------------------
/pipeline/dags/monitoring/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/pipeline/dags/monitoring/__init__.py
--------------------------------------------------------------------------------
/pipeline/dags/one_off/.placeholder:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/pipeline/dags/one_off/.placeholder
--------------------------------------------------------------------------------
/pipeline/dags/one_off/dag_gen_globaltrust_by_date_v0.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime, timedelta
2 |
3 | from airflow import DAG
4 | from airflow.operators.bash import BashOperator
5 | from hooks.discord import send_alert_discord
6 |
7 |
8 | default_args = {
9 | 'owner': 'coder2j',
10 | 'retries': 5,
11 | 'retry_delay': timedelta(minutes=2),
12 | # 'on_failure_callback': send_alert_discord,
13 | }
14 |
15 | # 2024-06-04 00:00
16 | # 875822
17 | # 2024-06-05 00:00
18 | # 875822
19 | # 2024-06-11 00:00
20 | # 921037
21 | # 2024-06-12 00:00
22 | # 921037
23 | # 2024-06-15 00:00
24 | # 960387
25 | # 2024-06-16 00:00
26 | # 960387
27 | with DAG(
28 | dag_id='one_off_gen_globaltrust_by_date_v0',
29 | default_args=default_args,
30 | description='This runs run_globaltrust_pipeline.sh without any optimization',
31 | schedule_interval=None,
32 | start_date=None,
33 | is_paused_upon_creation=True,
34 | max_active_runs=1,
35 | catchup=False,
36 | ) as dag:
37 | push_to_dune = BashOperator(
38 | task_id='push_to_dune',
39 | bash_command="cd /pipeline/dags/pg_to_dune && ./upload_to_dune.sh overwrite_globaltrust_in_dune_v3 "
40 | )
41 |
42 | task1 = BashOperator(
43 | task_id='06-05',
44 | bash_command="cd /pipeline && ./run_globaltrust_pipeline.sh -w . -v ./.venv -d 2024-06-05"
45 | )
46 |
47 | task2 = BashOperator(
48 | task_id='06-12',
49 | bash_command="cd /pipeline && ./run_globaltrust_pipeline.sh -w . -v ./.venv -d 2024-06-12"
50 | )
51 |
52 | task3 = BashOperator(
53 | task_id='06-16',
54 | bash_command="cd /pipeline && ./run_globaltrust_pipeline.sh -w . -v ./.venv -d 2024-06-16"
55 | )
56 |
57 | task5 = BashOperator(
58 | task_id='06-04',
59 | bash_command="cd /pipeline && ./run_globaltrust_pipeline.sh -w . -v ./.venv -d 2024-06-04"
60 | )
61 |
62 | task6 = BashOperator(
63 | task_id='06-11',
64 | bash_command="cd /pipeline && ./run_globaltrust_pipeline.sh -w . -v ./.venv -d 2024-06-11"
65 | )
66 |
67 | task7 = BashOperator(
68 | task_id='06-15',
69 | bash_command="cd /pipeline && ./run_globaltrust_pipeline.sh -w . -v ./.venv -d 2024-06-15 "
70 | )
71 |
72 | task1 >> task2 >> task3 >> push_to_dune >> task5 >> task6 >> task7
73 |
74 |
--------------------------------------------------------------------------------
/pipeline/dags/one_off/dag_gen_globaltrust_by_date_v1.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime, timedelta
2 |
3 | from airflow import DAG
4 | from airflow.utils.trigger_rule import TriggerRule
5 | from airflow.operators.trigger_dagrun import TriggerDagRunOperator
6 | from airflow.operators.bash import BashOperator
7 |
8 | from hooks.discord import send_alert_discord
9 | from hooks.pagerduty import send_alert_pagerduty
10 |
11 |
12 | default_args = {
13 | 'owner': 'karma3labs',
14 | 'retries': 5,
15 | 'retry_delay': timedelta(minutes=2),
16 | 'on_failure_callback': [send_alert_discord, send_alert_pagerduty],
17 | }
18 |
19 | with DAG(
20 | dag_id='one_off_gen_globaltrust_by_date_v1',
21 | default_args=default_args,
22 | description='This runs run_globaltrust_pipeline.sh without any optimization',
23 | start_date=datetime(2024, 8, 16),
24 | schedule_interval=None,
25 | is_paused_upon_creation=True,
26 | max_active_runs=1,
27 | catchup=False,
28 | ) as dag:
29 |
30 | mkdir_tmp = BashOperator(
31 | task_id="mkdir_tmp",
32 | bash_command= "cd /pipeline; mkdir -p tmp/{{ run_id }}; mkdir -p tmp/graph_files",
33 | dag=dag)
34 |
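  |     # One-off backfill for 2024-10-26: prep inputs, compute engagement globaltrust,
  |     # insert into the DB, push to Dune, then trigger the rank-view refresh.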
35 | prep_globaltrust = BashOperator(
36 | task_id="prep_globaltrust",
37 | bash_command= "cd /pipeline; ./run_globaltrust_pipeline.sh -s prep"
38 | " -w . -v ./.venv -t tmp/{{ run_id }} -o tmp/graph_files/ -d 2024-10-26",
39 | dag=dag)
40 |
41 | compute_engagement = BashOperator(
42 | task_id="compute_engagement",
43 | bash_command= "cd /pipeline; ./run_globaltrust_pipeline.sh -s compute_engagement"
44 | " -w . -v ./.venv -t tmp/{{ run_id }} -o tmp/graph_files/ -d 2024-10-26",
45 | dag=dag)
46 |
47 |
48 | insert_db = BashOperator(
49 | task_id="insert_db",
50 | bash_command= "cd /pipeline; ./run_globaltrust_pipeline.sh -s insert_db"
51 | " -w . -v ./.venv -t tmp/{{ run_id }} -o tmp/graph_files/ -d 2024-10-26",
52 | dag=dag)
53 |
54 | upload_to_dune = BashOperator(
55 | task_id="upload_to_dune",
56 | bash_command= "cd /pipeline/dags/pg_to_dune; ./upload_to_dune.sh overwrite_globaltrust_in_dune_v3",
57 | dag=dag)
58 |
59 | trigger_refresh_views = TriggerDagRunOperator(
60 | task_id="trigger_refresh_views",
61 | trigger_dag_id="refresh_rank_view_v0",
62 | conf={"trigger": "gen_globaltrust_v1"},
63 | )
64 |
65 | # trigger_sync_sandbox = TriggerDagRunOperator(
66 | # task_id="trigger_sync_sandbox",
67 | # trigger_dag_id="sync_sandbox_globaltrust",
68 | # conf={"trigger": "gen_globaltrust_v1"},
69 | # )
70 |
71 | (
72 | mkdir_tmp
73 | >> prep_globaltrust
74 | >> compute_engagement
75 | >> insert_db
76 | >> upload_to_dune
77 | >> trigger_refresh_views
78 | # >> trigger_sync_sandbox
79 | )
80 |
--------------------------------------------------------------------------------
/pipeline/dags/one_off/dag_insert_to_dune_table.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime, timedelta
2 |
3 | from airflow import DAG
4 | from airflow.operators.bash import BashOperator
5 | from hooks.discord import send_alert_discord
6 | from hooks.pagerduty import send_alert_pagerduty
7 |
8 |
9 | default_args = {
10 | 'owner': 'coder2j',
11 | 'retries': 5,
12 | 'retry_delay': timedelta(minutes=2),
13 | 'on_failure_callback': [send_alert_discord, send_alert_pagerduty],
14 | }
15 |
16 |
17 | with DAG(
18 | dag_id='one_off_insert_to_dune_tables',
19 | default_args=default_args,
20 | description='This inserts globaltrust and channel_ranking into dune',
21 | schedule_interval=None,
22 | start_date=None,
23 | is_paused_upon_creation=True,
24 | max_active_runs=1,
25 | catchup=False,
26 | ) as dag:
27 | task4 = BashOperator(
28 | task_id='overwrite_globaltrust_in_dune_v3',
29 | bash_command="cd /pipeline/dags/pg_to_dune && ./upload_to_dune.sh overwrite_globaltrust_in_dune_v3"
30 | )
31 |
32 | task5 = BashOperator(
33 | task_id='overwrite_channel_rank_in_dune_v3',
34 | bash_command="cd /pipeline/dags/pg_to_dune && ./upload_to_dune.sh overwrite_channel_rank_in_dune_v3"
35 | )
36 |
37 | [task4, task5]
38 |
39 |
--------------------------------------------------------------------------------
/pipeline/dags/one_off/dag_migrate_dune_table.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime, timedelta
2 |
3 | from airflow import DAG
4 | from airflow.operators.bash import BashOperator
5 |
6 |
7 | default_args = {
8 | 'owner': 'coder2j',
9 | 'retries': 5,
10 | 'retry_delay': timedelta(minutes=2)
11 | }
12 |
13 |
14 | with DAG(
15 | dag_id='one_off_migrate_dune_table',
16 | default_args=default_args,
17 | description='This backs up globaltrust, localtrust and channel_ranking into s3',
18 | schedule_interval=None,
19 | start_date=None,
20 | is_paused_upon_creation=True,
21 | max_active_runs=1,
22 | catchup=False,
23 | ) as dag:
24 | task1 = BashOperator(
25 | task_id='create_dune_globaltrust_table',
26 | bash_command="cd /pipeline/dags/pg_to_dune && ./upload_to_dune.sh create_dune_globaltrust_table dataset_k3l_cast_globaltrust_v2"
27 | )
28 |
29 | [task1]
30 |
31 |
--------------------------------------------------------------------------------
/pipeline/dags/one_off/dag_trial_branch.py:
--------------------------------------------------------------------------------
1 | from datetime import timedelta
2 |
3 | import pytz
4 | import datetime
5 | from airflow import DAG
6 | from airflow.utils.trigger_rule import TriggerRule
7 | from airflow.operators.empty import EmptyOperator
8 | from airflow.operators.python import PythonOperator
9 |
10 | from airflow.decorators import task, task_group
11 |
12 | default_args = {
13 | 'owner': 'karma3labs',
14 | 'retries': 5,
15 | 'retry_delay': timedelta(minutes=2),
16 | }
17 |
18 | def _monday_9ampacific_in_utc_time():
19 | pacific_tz = pytz.timezone('US/Pacific')
20 | pacific_9am_str = ' '.join([datetime.datetime.now(pacific_tz).strftime("%Y-%m-%d"),'09:00:00'])
21 | pacific_time = pacific_tz.localize(datetime.datetime.strptime(pacific_9am_str, '%Y-%m-%d %H:%M:%S'))
22 | utc_time = pacific_time.astimezone(pytz.utc)
23 | monday = utc_time - timedelta(days=utc_time.weekday())
24 | return monday
25 |
26 | with DAG(
27 | dag_id='one_off_trial_branch',
28 | default_args=default_args,
29 | description="One off dag to test new features",
30 | schedule_interval=None,
31 | start_date=None,
32 | is_paused_upon_creation=True,
33 | max_active_runs=1,
34 | catchup=False,
35 | ) as dag:
36 |
37 | @task.branch(task_id="branch")
38 | def branch_fn(**context):
39 | print(f"context: {context}")
40 | prev = context['prev_execution_date_success']
41 | print(f"prev_execution_date_success: {prev}")
42 | if prev > _monday_9ampacific_in_utc_time():
43 | return "t2"
44 | return "t1"
45 |
46 | def empty_fn(*args, **kwargs):
47 | pass
48 |
49 | branch = branch_fn()
50 | t1 = EmptyOperator(task_id="t1")
51 | t2 = EmptyOperator(task_id="t2")
52 |
53 |
54 | @task_group(group_id='all_group')
55 | def tg_all():
56 | always = PythonOperator(task_id="always",
57 | python_callable=empty_fn,
58 | op_args=[],
59 | op_kwargs={},
60 | trigger_rule=TriggerRule.ALL_SUCCESS)
61 | t3 = EmptyOperator(task_id="t3")
62 |
63 | always >> t3
64 |
65 | @task_group(group_id='some_group')
66 | def tg_some():
67 | always = PythonOperator(task_id="always",
68 | python_callable=empty_fn,
69 | op_args=[],
70 | op_kwargs={},
71 | trigger_rule=TriggerRule.ALL_SUCCESS)
72 | sometimes = EmptyOperator(task_id="sometimes")
73 | t3 = EmptyOperator(task_id="t3")
74 |
75 | always >> sometimes >> t3
76 |
77 | branch >> t1 >> tg_all()
78 | branch >> t2 >> tg_some()
79 |
80 |
--------------------------------------------------------------------------------
/pipeline/dags/one_off/dag_trial_sql.py:
--------------------------------------------------------------------------------
1 | from datetime import timedelta
2 |
3 | from airflow import DAG
4 | from airflow.operators.empty import EmptyOperator
5 | from airflow.providers.common.sql.operators.sql import SQLCheckOperator
6 |
7 | default_args = {
8 | "owner": "karma3labs",
9 | "retries": 0,
10 | "retry_delay": timedelta(minutes=5),
11 | }
12 |
13 | _CONN_ID = "eig2_readonly_user"
14 | CHECK_QUERY = """
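  | -- Compares per-strategy row and distinct-channel counts between k3l_channel_rank and its
  | -- intended replacement table (still k3l_channel_rank until the TODO below); returns one boolean.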
15 | WITH
16 | channel_rank_stats AS (
17 | SELECT
18 | COUNT(*) AS tot_rows,
19 | COUNT(DISTINCT channel_id) AS tot_channels,
20 | strategy_name
21 | FROM k3l_channel_rank
22 | GROUP BY strategy_name
23 | ),
24 | channel_fids_stats as (
25 | SELECT
26 | COUNT(*) AS tot_rows,
27 | COUNT(DISTINCT channel_id) AS tot_channels,
28 | strategy_name
29 | -- TODO change table name to k3l_channel_fids
30 | FROM k3l_channel_rank
31 | GROUP BY strategy_name
32 | )
33 | SELECT
34 | BOOL_AND(
35 | t2.tot_rows >= t1.tot_rows
36 | AND t2.tot_channels >= t1.tot_channels
37 | AND t2.strategy_name IS NOT NULL
38 | )
39 | FROM channel_rank_stats as t1
40 | LEFT JOIN channel_fids_stats as t2 ON (t2.strategy_name = t1.strategy_name)
41 | """
42 |
43 | with DAG(
44 | "one_off_trial_sql",
45 | default_args=default_args,
46 | description="One off dag to test new features",
47 | schedule_interval=None,
48 | start_date=None,
49 | is_paused_upon_creation=True,
50 | max_active_runs=1,
51 | catchup=False,
52 | ) as dag:
53 |
54 | start = EmptyOperator(task_id="start")
55 |
56 | sql_check = SQLCheckOperator(
57 | task_id='sql_check',
58 | sql=CHECK_QUERY,
59 | conn_id=_CONN_ID
60 | )
61 |
62 | end = EmptyOperator(task_id="end")
63 |
64 | start >> sql_check >> end
65 |
--------------------------------------------------------------------------------
/pipeline/dags/one_off/dag_trial_task_groups.py:
--------------------------------------------------------------------------------
1 | from datetime import timedelta
2 |
3 | from airflow import DAG
4 | from airflow.operators.empty import EmptyOperator
5 | from airflow.operators.bash import BashOperator
6 | from airflow.decorators import task_group
7 |
8 | default_args = {
9 | "owner": "karma3labs",
10 | "retries": 1,
11 | "retry_delay": timedelta(minutes=5),
12 | }
13 |
14 | with DAG(
15 | "one_off_trial_task_groups",
16 | default_args=default_args,
17 | description="One off dag to test new features",
18 | schedule_interval=None,
19 | start_date=None,
20 | is_paused_upon_creation=True,
21 | max_active_runs=1,
22 | catchup=False,
23 | ) as dag:
24 |
25 | @task_group(group_id='my_start_group')
26 | def tg_start():
27 | start = EmptyOperator(task_id="start")
28 |
29 | echo1 = BashOperator(
30 | task_id="echo1",
31 | bash_command= "echo {{ (logical_date - macros.timedelta(days=90)) | ds }}",
32 | dag=dag
33 | )
34 |
35 | echo2 = BashOperator(
36 | task_id="echo2",
37 | bash_command= "echo '{{ prev_data_interval_end_success }}'",
38 | dag=dag
39 | )
40 |
41 | start >> echo1 >> echo2
42 |
43 | @task_group(group_id='my_echo_group')
44 | def tg_echo():
45 |
46 | echo3 = BashOperator(
47 | task_id="echo3",
48 | bash_command= "echo {{ macros.ds_add(ds, -90) }}",
49 | dag=dag
50 | )
51 |
52 | echo4 = BashOperator(
53 | task_id="echo4",
54 | bash_command= "echo {{ ds }}",
55 | dag=dag
56 | )
57 |
58 | echo5 = BashOperator(
59 | task_id="echo5",
60 | bash_command= "echo {{ logical_date }}",
61 | dag=dag
62 | )
63 | echo3 >> echo4
64 | echo5
65 |
66 | end = EmptyOperator(task_id="end")
67 |
68 | tg_start() >> tg_echo() >> end
69 |
70 |
--------------------------------------------------------------------------------
/pipeline/dags/pg_to_dune/.env.sample:
--------------------------------------------------------------------------------
1 | DB_HOST=localhost
2 | DB_PORT=5432
3 | DB_NAME=farcaster
4 | DB_SSLMODE=allow
5 | DB_USERNAME=k3l_user
6 | DB_PASSWORD=changeme
7 | AWS_ACCESS_KEY_ID="changeme"
8 | AWS_SECRET_ACCESS_KEY="changeme"
9 | AWS_REGION="eu-central-1"
10 | GCP_TASK_ACCT="changeme"
11 | GCS_BUCKET_NAME="changeme"
12 | S3_BUCKET_NAME_CONSTANT="changeme"
13 | DUNE_API_KEY="changeme"
--------------------------------------------------------------------------------
/pipeline/dags/pg_to_dune/app/check_last_timestamp.py:
--------------------------------------------------------------------------------
1 |
2 | import os, json
3 | from dune_client.types import QueryParameter
4 | from dune_client.client import DuneClient
5 | from dune_client.query import QueryBase
6 |
7 | # change the current working directory where .env file lives
8 | # os.chdir("/Users/abc/local-Workspace/python-notebook-examples")
9 | # load .env file
10 | # dotenv.load_dotenv(".env")
11 | # setup Dune Python client
12 | dune = DuneClient(os.environ["DUNE_API_KEY"])
13 |
14 | query = QueryBase(
15 | name="fetch last date of globaltrust_v2",
16 | query_id=int(os.environ["QUERY_ID"]),
17 | )
18 |
19 | result = dune.run_query(
20 | query = query,
21 | # performance = 'large' # optionally define which tier to run the execution on (default is "medium")
22 | )
23 |
24 | if len(result.result.rows) != 1:
25 |     raise ValueError(f"Expected exactly one row from Dune, got {len(result.result.rows)}")
26 |
27 | last_date = result.result.rows[0][os.environ["FILTER_COLUMN"]]
28 | print(last_date)
29 | # # go over the results returned
30 | # for row in result.result.rows:
31 | # print('hell')
32 | # print (row) # as an example we print the rows
33 |
--------------------------------------------------------------------------------
/pipeline/dags/reports/dag_gen_channel_metrics.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime, timedelta
2 |
3 | from airflow import DAG
4 | # from airflow.operators.empty import EmptyOperator
5 | from airflow.operators.bash import BashOperator
6 |
7 |
8 | from hooks.discord import send_alert_discord
9 | from hooks.pagerduty import send_alert_pagerduty
10 |
11 | default_args = {
12 | 'owner': 'karma3labs',
13 | 'retries': 5,
14 | 'retry_delay': timedelta(minutes=2),
15 | 'on_failure_callback': [send_alert_discord, send_alert_pagerduty],
16 | }
17 |
18 |
19 | with DAG(
20 | dag_id='report_gen_metrics',
21 | default_args=default_args,
22 | description='this generates channel metrics',
23 | start_date=datetime(2024, 8, 15),
24 | schedule_interval='0 */6 * * *',
25 | is_paused_upon_creation=True,
26 | max_active_runs=1,
27 | catchup=False,
28 | ) as dag:
29 |
30 | # gen_channel_metrics = EmptyOperator(task_id="gen_channel_metrics")
31 |
32 | gen_channel_metrics = BashOperator(
33 | task_id='gen_channel_metrics',
34 | bash_command='cd /pipeline/ && ./run_channel_metrics.sh -w . -v ./.venv/ -r '
35 | )
--------------------------------------------------------------------------------
/pipeline/dags/reports/dag_gen_labels.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime, timedelta
2 |
3 | from airflow import DAG
4 | from airflow.operators.bash import BashOperator
5 | from airflow.operators.trigger_dagrun import TriggerDagRunOperator
6 |
7 |
8 | from hooks.discord import send_alert_discord
9 | from hooks.pagerduty import send_alert_pagerduty
10 |
11 | default_args = {
12 | 'owner': 'karma3labs',
13 | 'retries': 5,
14 | 'retry_delay': timedelta(minutes=2),
15 | # 'on_failure_callback': [send_alert_discord, send_alert_pagerduty],
16 | }
17 |
18 |
19 | with DAG(
20 | dag_id='report_gen_labels',
21 | default_args=default_args,
22 | description='This fetches spammers and save the list into s3',
23 | start_date=datetime(2024, 8, 15),
24 | schedule_interval='0 0 * * *',
25 | is_paused_upon_creation=True,
26 | max_active_runs=1,
27 | catchup=False,
28 | ) as dag:
29 |
30 | gen_top_spammers = BashOperator(
31 | task_id='gen_top_spammers',
32 | bash_command="cd /pipeline && ./run_fetch_top_spammers.sh -v ./.venv"
33 | )
34 |
35 | gen_top_casters = BashOperator(
36 | task_id='gen_top_casters',
37 | bash_command="cd /pipeline && ./run_fetch_top_caster.sh -v ./.venv"
38 | )
39 |
40 | trigger_sync_sandbox = TriggerDagRunOperator(
41 | task_id="trigger_sync_sandbox",
42 | trigger_dag_id="sync_sandbox_db_labels",
43 | conf={"trigger": "report_gen_labels"},
44 | )
45 |
46 | gen_top_spammers >> gen_top_casters >> trigger_sync_sandbox
47 |
48 |
--------------------------------------------------------------------------------
/pipeline/dags/triggers/trigger_gen_channel_ranking_v3.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime, timedelta, timezone
2 | from airflow.operators.empty import EmptyOperator
3 | from airflow.operators.trigger_dagrun import TriggerDagRunOperator
4 | from airflow.decorators import task, dag
5 | from airflow.models import DagRun
6 | from airflow.utils.state import DagRunState
7 |
8 | default_args = {
9 | 'owner': 'karma3labs',
10 | 'retries': 5,
11 | 'retry_delay': timedelta(minutes=2),
12 | }
13 |
14 | N_CHUNKS = 100 # Define the number of chunks
15 | FREQUENCY_H = 12 # Define the frequency in hours
16 |
17 | @dag(
18 | dag_id='trigger_gen_channel_ranking_v3',
19 | default_args=default_args,
20 | start_date=datetime(2024, 10, 1),
21 | schedule_interval=timedelta(hours=6),
22 | is_paused_upon_creation=True,
23 | max_active_runs=1,
24 | catchup=False # To avoid backfilling if not required
25 | )
26 | def create_trigger_dag():
27 | skip_main_dag = EmptyOperator(task_id="skip_main_dag")
28 |
29 | trigger_main_dag = TriggerDagRunOperator(
30 | task_id='trigger_main_dag',
31 | trigger_dag_id='gen_channel_ranking_v3',
32 | execution_date='{{ macros.datetime.now() }}',
33 | conf={"trigger": "trigger_gen_channel_ranking_v3"},
34 | )
35 |
36 | @task.branch(task_id="check_last_successful_run")
37 |     def check_last_successful_run(**context) -> str:
38 | dag_runs = DagRun.find(dag_id="gen_channel_ranking_v3", state=DagRunState.SUCCESS)
39 | if not dag_runs or len(dag_runs) == 0:
40 | # No previous runs
41 | print("No previous runs")
42 | return "trigger_main_dag"
43 | print(f"Found {len(dag_runs)} previous runs")
44 | dag_runs.sort(key=lambda x: x.execution_date, reverse=True)
45 | print("Last run: ", dag_runs[0])
46 | # Query the last successful DAG run
47 | last_run = dag_runs[0]
48 | print("Last run: ", last_run)
49 | current_time = datetime.now(timezone.utc)
50 | delta = FREQUENCY_H
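  |         # delta = hours since the most recent of the last run's start/end timestamps;
  |         # it stays at FREQUENCY_H when no timestamp is available, so the trigger fires.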
51 | if last_run:
52 | print("Last run end_date: ", last_run.end_date)
53 | print("Last run start_date: ", last_run.start_date)
54 | if last_run.end_date:
55 | delta_last = (current_time - last_run.end_date).total_seconds() / 3600
56 | delta = min(delta_last, delta)
57 | if last_run.start_date:
58 | delta_last = (current_time - last_run.start_date).total_seconds() / 3600
59 | delta = min(delta_last, delta)
60 | print(f"Delta: {delta}")
61 | if delta >= FREQUENCY_H:
62 | # Last run was more than FREQUENCY_H hours ago, so we should run
63 | print(f"Last run was more than {FREQUENCY_H} hours ago, so we should run")
64 | return "trigger_main_dag"
65 | return "skip_main_dag"
66 |
67 | check_last_successful_run = check_last_successful_run()
68 |
69 | check_last_successful_run >> trigger_main_dag
70 |
71 | check_last_successful_run >> skip_main_dag
72 |
73 | trigger_dag = create_trigger_dag()
74 |
75 |
--------------------------------------------------------------------------------
/pipeline/dags/triggers/trigger_gen_channel_ranking_v4.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime, timedelta, timezone
2 | from airflow.operators.empty import EmptyOperator
3 | from airflow.operators.trigger_dagrun import TriggerDagRunOperator
4 | from airflow.decorators import task, dag
5 | from airflow.models import DagRun
6 | from airflow.utils.state import DagRunState
7 |
8 | default_args = {
9 | 'owner': 'karma3labs',
10 | 'retries': 5,
11 | 'retry_delay': timedelta(minutes=2),
12 | }
13 |
14 | N_CHUNKS = 100 # Define the number of chunks
15 | FREQUENCY_H = 24 # Define the frequency in hours
16 |
17 | @dag(
18 | dag_id='trigger_gen_channel_ranking_v4',
19 | default_args=default_args,
20 | start_date=datetime(2024, 10, 1),
21 | schedule_interval=timedelta(hours=24),
22 | is_paused_upon_creation=True,
23 | max_active_runs=1,
24 | catchup=False # To avoid backfilling if not required
25 | )
26 | def create_trigger_dag():
27 | skip_main_dag = EmptyOperator(task_id="skip_main_dag")
28 |
29 | trigger_main_dag = TriggerDagRunOperator(
30 | task_id='trigger_main_dag',
31 | trigger_dag_id='gen_channel_ranking_v4',
32 | execution_date='{{ macros.datetime.now() }}',
33 | conf={"trigger": "trigger_gen_channel_ranking_v4"},
34 | )
35 |
36 | @task.branch(task_id="check_last_successful_run")
37 |     def check_last_successful_run(**context) -> str:
38 | dag_runs = DagRun.find(dag_id="gen_channel_ranking_v4", state=DagRunState.SUCCESS)
39 | if not dag_runs or len(dag_runs) == 0:
40 | # No previous runs
41 | print("No previous runs")
42 | return "trigger_main_dag"
43 | print(f"Found {len(dag_runs)} previous runs")
44 | dag_runs.sort(key=lambda x: x.execution_date, reverse=True)
45 | print("Last run: ", dag_runs[0])
46 | # Query the last successful DAG run
47 | last_run = dag_runs[0]
48 | print("Last run: ", last_run)
49 | current_time = datetime.now(timezone.utc)
50 | delta = FREQUENCY_H
51 | if last_run:
52 | print("Last run end_date: ", last_run.end_date)
53 | print("Last run start_date: ", last_run.start_date)
54 | if last_run.end_date:
55 | delta_last = (current_time - last_run.end_date).total_seconds() / 3600
56 | delta = min(delta_last, delta)
57 | if last_run.start_date:
58 | delta_last = (current_time - last_run.start_date).total_seconds() / 3600
59 | delta = min(delta_last, delta)
60 | print(f"Delta: {delta}")
61 | if delta >= FREQUENCY_H:
62 | # Last run was more than FREQUENCY_H hours ago, so we should run
63 | print(f"Last run was more than {FREQUENCY_H} hours ago, so we should run")
64 | return "trigger_main_dag"
65 | return "skip_main_dag"
66 |
67 | check_last_successful_run = check_last_successful_run()
68 |
69 | check_last_successful_run >> trigger_main_dag
70 |
71 | check_last_successful_run >> skip_main_dag
72 |
73 | trigger_dag = create_trigger_dag()
74 |
75 |
--------------------------------------------------------------------------------
/pipeline/extractors/automod_extractor.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from datetime import date
3 | import requests
4 | from sqlalchemy import create_engine
5 | from sqlalchemy import text
6 | import io
7 | from loguru import logger
8 | import sys
9 |
10 |
11 | def fetch_data_from_api(api_key, db_user, db_password, db_endpoint):
12 | params = {'start': '2024-01-01', 'end': '2024-12-31'}
13 | headers = {'api-key': f"{api_key}"}
14 | df_automod = pd.DataFrame()
15 | for channel in ["degen", "dev", "memes"]:
16 | initial_url = f"https://automod.sh/api/partners/channels/{channel}/activity/export?"
17 | response = requests.get(initial_url, params=params, headers=headers)
18 | print(response.url)
19 | if response.status_code == 200:
20 | # Read the response content into a pandas DataFrame
21 | data = pd.read_csv(io.StringIO(response.content.decode('utf-8')))
22 | data["channel_id"] = channel
23 | print(len(data))
24 | df_automod = pd.concat([df_automod, data], axis=0)
25 | else:
26 | raise Exception(f"Failed to fetch data from automod. Status code: {response.status_code}")
27 |
28 | if len(df_automod) == 0:
29 | raise Exception("Failed to fetch data from automod. No data found.")
30 |
31 | rename_dict = {
32 | 'createdAt': 'created_at',
33 | 'affectedUsername': 'affected_username',
34 | 'affectedUserFid': 'affected_userid',
35 | 'castHash': 'cast_hash',
36 | 'castText': 'cast_text'
37 | }
38 |
39 | df_automod.rename(columns=rename_dict, inplace=True)
40 | df_automod = df_automod[
41 | ["created_at", "action", "actor", "affected_username", "affected_userid", "cast_hash", "channel_id"]]
42 | df_automod['created_at'] = pd.to_datetime(df_automod['created_at'], unit='ms')
43 | df_automod["date_iso"] = date.today()
44 |
45 | logger.info(df_automod.head())
46 | engine_string = "postgresql+psycopg2://%s:%s@%s:%d/%s" \
47 | % (db_user, db_password, db_endpoint, 9541, 'farcaster')
48 |
49 | postgres_engine = create_engine(engine_string, connect_args={"connect_timeout": 1000})
50 | with postgres_engine.begin() as conn:
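  |         # Full refresh: truncate and reload the fetched rows inside a single transaction.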
51 | conn.execute(text("TRUNCATE TABLE automod_data"))
52 | df_automod.to_sql('automod_data', con=conn, if_exists='append', index=False)
53 | return None
54 |
55 |
56 | if __name__ == "__main__":
57 | # Get the parameters from the command line arguments
58 | if len(sys.argv) != 5:
59 |         raise ValueError("Please provide api_key, db_user, db_password, and db_endpoint as arguments.")
60 |
61 | api_key = sys.argv[1]
62 | db_user = sys.argv[2]
63 | db_password = sys.argv[3]
64 | db_endpoint = sys.argv[4]
65 |
66 | fetch_data_from_api(api_key, db_user, db_password, db_endpoint)
--------------------------------------------------------------------------------
/pipeline/extractors/extract_channel_data.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | while getopts w:v:c:s:d flag
4 | do
5 | case "${flag}" in
6 | w) WORK_DIR=${OPTARG};;
7 | v) VENV=${OPTARG};;
8 | esac
9 | done
10 |
11 | if [ -z "$VENV" ] || [ -z "$WORK_DIR" ]; then
12 | echo "Usage: $0 -w [work_dir] -v [venv] "
13 | echo ""
14 | echo "Example: $0 -w . -v /home/ubuntu/farcaster-graph/publisher/.venv "
15 | echo ""
16 | echo "Params:"
17 | echo " [work_dir] The working directory to read .env file and execute scripts from."
18 | echo " [venv] The path where a python3 virtualenv has been created."
19 | echo ""
20 | exit
21 | fi
22 |
23 | # Setup environment variables
24 | echo "Setting up environment variables"
25 | source $WORK_DIR/.env
26 |
27 | # Activate
28 | echo "Activating Python 3.12 environment"
29 | source $VENV/bin/activate
30 |
31 | # Install
32 | echo "Installing requirements"
33 | #pip install -r requirements.txt
34 |
35 | # Run
36 | echo "Running channel data import"
37 | /usr/bin/env python3 -m extractors.main_channel_data
38 |
39 | if [ $? -ne 0 ]; then
40 | echo "Failed to run script"
41 | exit 1
42 | fi
43 |
44 | # Teardown
45 | echo "Deactivating Python 3.12 environment"
46 | deactivate
47 |
--------------------------------------------------------------------------------
/pipeline/extractors/extract_cura_mod.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | while getopts w:v:rd flag
4 | do
5 | case "${flag}" in
6 | w) WORK_DIR=${OPTARG};;
7 | v) VENV=${OPTARG};;
8 | r) RUN_FLAG="--run";;
9 | d) DRYRUN_FLAG="--dry-run";;
10 | esac
11 | done
12 |
13 | if [ -z "$VENV" ] || [ -z "$WORK_DIR" ] || [ -z "$RUN_FLAG" ]; then
14 | echo "Usage: $0 -w [work_dir] -v [venv] -r -d "
15 | echo ""
16 | echo "Example: $0 -w . -v /home/ubuntu/farcaster-graph/publisher/.venv -r"
17 | echo "Example: $0 -w . -v /home/ubuntu/farcaster-graph/publisher/.venv -r -d"
18 | echo ""
19 | echo "Params:"
20 | echo " [work_dir] The working directory to read .env file and execute scripts from."
21 | echo " [venv] The path where a python3 virtualenv has been created."
22 | echo " [run] Flag to run the script."
23 | echo " [dryrun] Flag to run the script in dry-run mode."
24 | echo ""
25 | exit
26 | fi
27 |
28 | set -e
29 | set -o pipefail
30 |
31 | # Setup environment variables
32 | echo "Setting up environment variables"
33 | source $WORK_DIR/.env
34 |
35 | # Activate
36 | echo "Activating Python 3.12 environment"
37 | source $VENV/bin/activate
38 |
39 | # Install
40 | echo "Installing requirements"
41 | #pip install -r requirements.txt
42 |
43 | # Run
44 | echo "Running cura channel mod data extractor with flags"
45 | /usr/bin/env python3 -m extractors.cura_mod_extractor $RUN_FLAG $DRYRUN_FLAG
46 |
47 | if [ $? -ne 0 ]; then
48 | echo "Failed to run script"
49 | exit 1
50 | fi
51 |
52 | # Teardown
53 | echo "Deactivating Python 3.12 environment"
54 | deactivate
55 |
--------------------------------------------------------------------------------
/pipeline/extractors/main_channel_data.py:
--------------------------------------------------------------------------------
1 | from config import settings
2 | import utils
3 |
4 | import requests
5 | import pandas as pd
6 | from sqlalchemy import create_engine
7 | from sqlalchemy import text
8 | from loguru import logger
9 |
10 |
11 | def fetch_data_from_api():
12 | initial_url = "https://api.warpcast.com/v2/all-channels"
13 | response = requests.get(initial_url)
14 |
15 | df_warpcast_channels = pd.DataFrame(response.json()["result"]["channels"])
16 | df_warpcast_channels['createdAt'] = pd.to_datetime(df_warpcast_channels['createdAt'], unit='ms')
17 | df_warpcast_channels.columns = df_warpcast_channels.columns.str.lower()
18 | db_column_names = [
19 | "id",
20 | "url",
21 | "name",
22 | "description",
23 | "imageurl",
24 | "headerimageurl",
25 | "leadfid",
26 | "moderatorfids",
27 | "createdat",
28 | "followercount",
29 | "membercount",
30 | "pinnedcasthash",
31 | ]
32 | df_warpcast_channels = df_warpcast_channels.filter(items=db_column_names, axis=1)
33 | logger.info(utils.df_info_to_string(df_warpcast_channels, with_sample=True))
34 |
35 | if len(df_warpcast_channels) == 0:
36 | raise Exception("Failed to fetch data from warpcast. No data found.")
37 |
38 | postgres_engine = create_engine(settings.POSTGRES_URL.get_secret_value(), connect_args={"connect_timeout": 1000})
39 | try:
40 | with postgres_engine.begin() as conn:
41 | conn.execute(text("TRUNCATE TABLE warpcast_channels_data"))
42 | df_warpcast_channels.to_sql('warpcast_channels_data', con=conn, if_exists='append', index=False)
43 | except Exception as e:
44 | logger.error(f"Failed to insert data into postgres: {e}")
45 | raise e
46 |
47 |
48 | if __name__ == "__main__":
49 | fetch_data_from_api()
50 |
--------------------------------------------------------------------------------
/pipeline/frames/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/pipeline/frames/__init__.py
--------------------------------------------------------------------------------
/pipeline/frames/frames_db_utils.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | from timer import Timer
4 |
5 | import psycopg2
6 | import psycopg2.extras
7 |
8 |
9 | @Timer(name="fetch_unprocessed_urls")
10 | def fetch_unprocessed_urls(logger: logging.Logger, pg_dsn: str, limit: int) -> list[tuple]:
11 | """return will be of the form [(url_id, url)]"""
12 | fetch_sql = f"""
13 | SELECT url_id, url
14 | FROM k3l_url_labels
15 | WHERE processed_ts IS NULL
16 | ORDER BY earliest_cast_dt ASC
17 | LIMIT {limit}
18 | """
19 | with psycopg2.connect(pg_dsn) as conn:
20 | with conn.cursor() as cursor:
21 | logger.info(f"Executing: {fetch_sql}")
22 | cursor.execute(fetch_sql)
23 | url_records = cursor.fetchall()
24 | return url_records
25 |
26 | @Timer(name="update_url_categories")
27 | def update_url_categories(logger: logging.Logger, pg_dsn: str, url_categories: list[tuple]):
28 | """url_categories should be of the form [(url_id, category)]"""
29 | update_sql = """
30 | UPDATE k3l_url_labels as k
31 | SET processed_ts=now(), category=v.cat
32 | FROM (VALUES %s) AS v(id, cat)
33 | WHERE url_id=v.id;
34 | """
35 | with psycopg2.connect(pg_dsn) as conn:
36 | with conn.cursor() as cursor:
37 | logger.info(f"Executing: {update_sql}")
38 | psycopg2.extras.execute_values(cursor,
39 | update_sql,
40 | url_categories,
41 | template=None,
42 | page_size=100)
43 |
44 | @Timer(name="fetch_unparsed_urls")
45 | def fetch_unparsed_urls(logger: logging.Logger, pg_dsn: str, limit: int) -> list[tuple]:
46 | """return will be of the form [(url_id, url)]"""
47 | fetch_sql = f"""
48 | SELECT url_id, url
49 | FROM k3l_url_labels
50 | WHERE parsed_ts IS NULL
51 | ORDER BY earliest_cast_dt ASC
52 | LIMIT {limit}
53 | """
54 | with psycopg2.connect(pg_dsn) as conn:
55 | with conn.cursor() as cursor:
56 | logger.info(f"Executing: {fetch_sql}")
57 | cursor.execute(fetch_sql)
58 | url_records = cursor.fetchall()
59 | return url_records
60 |
61 | @Timer(name="update_url_parts")
62 | def update_url_parts(logger: logging.Logger, pg_dsn: str, url_parts: list[tuple]):
63 | """url_parts should be of the form [(url_id, scheme, domain, subdomain, tld, path)]"""
64 | update_sql = f"""
65 | UPDATE k3l_url_labels as k
66 | SET parsed_ts=now(), scheme=v.scheme, domain=v.domain, subdomain=v.subdomain, tld=v.tld, path=v.path
67 | FROM (VALUES %s) AS v(id, scheme, domain, subdomain, tld, path)
68 | WHERE url_id=v.id;
69 | """
70 | with psycopg2.connect(pg_dsn) as conn:
71 | with conn.cursor() as cursor:
72 | logger.info(f"Executing: {update_sql}")
73 | psycopg2.extras.execute_values(cursor,
74 | update_sql,
75 | url_parts,
76 | template=None,
77 | page_size=100)
78 |
79 |
--------------------------------------------------------------------------------
/pipeline/frames/incremental_load_cast_mapping.sql:
--------------------------------------------------------------------------------
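  | -- Incremental load: map embed URLs from casts created since the latest already-mapped cast
  | -- to their k3l_url_labels url_id, skipping common media/image hosts and file types.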
1 | INSERT INTO k3l_cast_embed_url_mapping(url_id, cast_id)
2 | WITH max_cast_dt AS (
3 | select
4 | max(latest_cast_dt) as dt
5 | from k3l_url_labels as labels
6 | inner join k3l_cast_embed_url_mapping as url_map on (labels.url_id = url_map.url_id)
7 | )
8 | SELECT
9 | labels.url_id as url_id,
10 | casts.id as cast_id
11 | FROM casts
12 | cross join lateral jsonb_array_elements(casts.embeds) as ems
13 | inner join max_cast_dt on (casts.created_at >= max_cast_dt.dt AND casts.deleted_at IS NULL)
14 | inner join
15 | k3l_url_labels as labels
16 | on (labels.url = ems->>'url'
17 | AND jsonb_array_length(embeds) > 0
18 | AND ems->'url' IS NOT NULL
19 | AND ems->>'url' NOT LIKE ALL(ARRAY[
20 | 'https://i.imgur.com/%',
21 | 'https://youtu.be/%',
22 | 'https://www.youtube.com/%',
23 | 'https://imagedelivery.net/%',
24 | '%.png', '%.gif', '%.pdf', '%.jpg', '%.jpeg', '%.mp4', '%.m3u8'])
25 | AND created_at >= max_cast_dt.dt
26 | )
--------------------------------------------------------------------------------
/pipeline/frames/incremental_load_labels.sql:
--------------------------------------------------------------------------------
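  | -- Incremental load: collect embed URLs from casts created since the latest already-mapped
  | -- cast that are not yet in k3l_url_labels, excluding common media/image hosts and file types.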
1 | INSERT INTO k3l_url_labels(url, latest_cast_dt, earliest_cast_dt)
2 | WITH max_cast_dt AS (
3 | select
4 | max(latest_cast_dt) as dt
5 | from k3l_url_labels as labels
6 | inner join k3l_cast_embed_url_mapping as url_map on (labels.url_id = url_map.url_id)
7 | )
8 | SELECT
9 | ems->>'url' as url,
10 | max(created_at) as latest_cast_dt,
11 | min(created_at) as earliest_cast_dt
12 | FROM
13 | casts
14 | cross join lateral jsonb_array_elements(casts.embeds) as ems
15 | inner join max_cast_dt on (casts.created_at >= max_cast_dt.dt AND casts.deleted_at IS NULL)
16 | left join
17 | k3l_url_labels as labels
18 | on (labels.url = ems->>'url'
19 | and casts.created_at >= max_cast_dt.dt
20 | )
21 | WHERE
22 | labels.url_id IS NULL
23 | AND jsonb_array_length(embeds) > 0
24 | AND ems->'url' IS NOT NULL
25 | AND ems->>'url' NOT LIKE ALL(ARRAY[
26 | 'https://i.imgur.com/%',
27 | 'https://youtu.be/%',
28 | 'https://www.youtube.com/%',
29 | 'https://imagedelivery.net/%',
30 | '%.png', '%.gif', '%.pdf', '%.jpg', '%.jpeg', '%.mp4', '%.m3u8'])
31 | GROUP BY ems->>'url'
--------------------------------------------------------------------------------
/pipeline/frames/scrape_utils.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from enum import Enum
3 | from typing import NamedTuple
4 | import asyncio
5 | from urllib.parse import urlparse
6 |
7 | import tldextract
8 | from bs4 import BeautifulSoup
9 | import aiohttp as aiohttp
10 |
11 | class URLCategory(Enum):
12 | FRAME = 'frame'
13 | TIMEOUT = 'timeout'
14 | BAD = 'bad'
15 | UNKNOWN = 'unknown'
16 | ERROR = 'error'
17 |
18 | async def categorize_url(
19 | logger: logging.Logger,
20 | url_id: int, url:str,
21 | session: aiohttp.ClientSession,
22 | timeout: aiohttp.ClientTimeout
23 | ) -> tuple[int, str]:
24 | logger.debug(f"Fetching {url_id} - {url}")
25 | try:
26 | if urlparse(url).scheme not in ['http','https']:
27 | logger.error(f"bad url {url_id} - {url}")
28 | return (url_id, URLCategory.BAD.value)
29 | async with session.get(url, timeout=timeout) as resp:
30 | body = await resp.text()
31 | soup = BeautifulSoup(body, 'html.parser')
32 | frame_meta = soup.find('meta', attrs={"property":"fc:frame"})
33 | return (url_id, URLCategory.FRAME.value) if frame_meta \
34 | else (url_id, URLCategory.UNKNOWN.value)
35 | except asyncio.TimeoutError as e:
36 | logger.error(f"{url_id} - {url} timed out: {e}")
37 | return (url_id, URLCategory.TIMEOUT.value)
38 | except aiohttp.InvalidURL as e:
39 | logger.error(f"bad url {url_id} - {url}: {e}")
40 | return (url_id, URLCategory.BAD.value)
41 | except aiohttp.ClientError as e:
42 | logger.error(f"error {url_id} - {url}: {e}")
43 | return (url_id, URLCategory.ERROR.value)
47 | except ValueError as e:
48 | logger.error(f"error {url_id} - {url}: {e}")
49 | return (url_id, URLCategory.ERROR.value)
50 | except Exception as e:
51 | logger.error(f"error {url_id} - {url}: {e}")
52 | return (url_id, URLCategory.ERROR.value)
53 |
54 | class URL_parts(NamedTuple):
55 | url_id: int
56 | scheme: str
57 | domain: str
58 | subdomain: str
59 | tld: str
60 | path: str
61 |
62 | def parse_url(
63 | logger: logging.Logger,
64 | url_id: int,
65 | url:str
66 | ) -> tuple[int, str, str, str, str, str]:
67 | logger.debug(f"parsing {url_id} - {url}")
68 | try:
69 | parse_result = urlparse(url)
70 | extract = tldextract.extract(url)
71 | path = parse_result.path
72 | if path.endswith(':'):
73 | path = path[:-1]
74 | return tuple(URL_parts(url_id,
75 | parse_result.scheme,
76 | extract.domain,
77 | extract.subdomain,
78 | extract.suffix,
79 | path))
80 | except Exception as e:
81 | logger.error(f"error {url_id} - {url}: {e}")
82 | return (url_id, '', '', '', '', '')
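83 |
84 | # Illustrative examples (assumptions for documentation only, not executed here):
85 | #   parse_url(logger, 1, 'https://blog.example.com/post/1')
86 | #     -> (1, 'https', 'example', 'blog', 'com', '/post/1')
87 | # categorize_url is a coroutine and must be awaited with a shared aiohttp.ClientSession
88 | # and an aiohttp.ClientTimeout; see frames/test_urls.py below for a runnable driver.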
--------------------------------------------------------------------------------
/pipeline/frames/test_urls.py:
--------------------------------------------------------------------------------
1 | # standard dependencies
2 | import sys
3 | import asyncio
4 |
5 | # local dependencies
6 | from config import settings
7 | from . import scrape_utils
8 |
9 | # 3rd party dependencies
10 | import aiohttp
11 | from dotenv import load_dotenv
12 | from loguru import logger
13 |
14 | logger.remove()
15 | level_per_module = {
16 |     "": settings.LOG_LEVEL,
17 |     "silentlib": False
18 | }
19 | logger.add(sys.stdout,
20 |            colorize=True,
21 |            format=settings.LOGURU_FORMAT,
22 |            filter=level_per_module,
23 |            level=0)
24 |
25 | async def test():
26 |     # categorize_url is a coroutine: it needs a shared aiohttp session and a ClientTimeout
27 |     test_cases = [
28 |         ('https://apis.cast.k3l.io', 1),
29 |         ('https://cast.k3l.io/apis123', 1),
30 |         ('https://cast.k3l.io', 1),
31 |         ('https://dune-frames.vercel.app/api', settings.FRAMES_SCRAPE_TIMEOUT_SECS),
32 |         ('https://www.youtube.com', settings.FRAMES_SCRAPE_TIMEOUT_SECS),
33 |         ('https://www.youttube.com', settings.FRAMES_SCRAPE_TIMEOUT_SECS),
34 |         ('abc', settings.FRAMES_SCRAPE_TIMEOUT_SECS),
35 |         ('http://1', settings.FRAMES_SCRAPE_TIMEOUT_SECS),
36 |     ]
37 |     async with aiohttp.ClientSession() as session:
38 |         for url, timeout_secs in test_cases:
39 |             timeout = aiohttp.ClientTimeout(total=timeout_secs)
40 |             url_category = await scrape_utils.categorize_url(logger, -1, url, session, timeout)
41 |             logger.debug(f"{url} category ? {url_category}")
42 |
43 | if __name__ == "__main__":
44 |     load_dotenv()
45 |     print(settings)
46 |
47 |     logger.debug('####### TODO use pytest ########')
48 |     asyncio.run(test())
49 |
--------------------------------------------------------------------------------
/pipeline/globaltrust/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/pipeline/globaltrust/__init__.py
--------------------------------------------------------------------------------
/pipeline/globaltrust/export_localtrust_daily_stats.sql:
--------------------------------------------------------------------------------
1 | with stats_per_strategy_per_date as (SELECT
2 | max(date) AS date,
3 | COUNT(CASE WHEN strategy_id = 1 THEN 1 END) AS strategy_id_1_row_count,
4 | AVG(CASE WHEN strategy_id = 1 THEN v END) AS strategy_id_1_mean,
5 | STDDEV(CASE WHEN strategy_id = 1 THEN v END) AS strategy_id_1_stddev,
6 | MAX(CASE WHEN strategy_id = 1 THEN v END) - MIN(CASE WHEN strategy_id = 1 THEN v END) AS strategy_id_1_range,
7 | COUNT(CASE WHEN strategy_id = 3 THEN 1 END) AS strategy_id_3_row_count,
8 | AVG(CASE WHEN strategy_id = 3 THEN v END) AS strategy_id_3_mean,
9 | STDDEV(CASE WHEN strategy_id = 3 THEN v END) AS strategy_id_3_stddev,
10 | MAX(CASE WHEN strategy_id = 3 THEN v END) - MIN(CASE WHEN strategy_id = 3 THEN v END) AS strategy_id_3_range
11 | FROM
12 | localtrust
13 | -- GROUP BY
14 | -- date
15 | )
16 |
17 | INSERT INTO localtrust_stats (
18 | date,
19 | strategy_id_1_row_count,
20 | strategy_id_1_mean,
21 | strategy_id_1_stddev,
22 | strategy_id_1_range,
23 | strategy_id_3_row_count,
24 | strategy_id_3_mean,
25 | strategy_id_3_stddev,
26 | strategy_id_3_range
27 | )
28 | SELECT
29 | date,
30 | strategy_id_1_row_count,
31 | strategy_id_1_mean,
32 | strategy_id_1_stddev,
33 | strategy_id_1_range,
34 | strategy_id_3_row_count,
35 | strategy_id_3_mean,
36 | strategy_id_3_stddev,
37 | strategy_id_3_range
38 | FROM
39 | stats_per_strategy_per_date;
40 |
--------------------------------------------------------------------------------
/pipeline/globaltrust/queries.py:
--------------------------------------------------------------------------------
1 | from db_utils import SQL
2 |
3 | class IJVSql:
4 | LIKES = SQL("LIKES", """
5 | SELECT reactions.fid as i, reactions.target_fid as j, count(1) as likes_v
6 | FROM reactions
7 | INNER JOIN fids ON fids.fid = reactions.target_fid
8 | WHERE reaction_type=1
9 | AND reactions.target_fid IS NOT NULL
10 | {condition}
11 | GROUP BY i, j
12 | """)
13 | REPLIES = SQL("REPLIES", """
14 | SELECT fid as i, parent_fid as j, count(1) as replies_v
15 | FROM casts
16 | WHERE parent_hash IS NOT NULL
17 | {condition}
18 | GROUP by i, j
19 | """)
20 | MENTIONS = SQL("MENTIONS", """
21 | WITH mention AS (
22 | SELECT fid as author_fid, mention as mention_fid, timestamp
23 | FROM casts, unnest(casts.mentions) as mention
24 | )
25 | SELECT
26 | author_fid as i, mention_fid as j, count(1) as mentions_v
27 | FROM mention
28 | INNER JOIN fids ON fids.fid = mention.mention_fid
29 | {condition}
30 | GROUP BY i, j
31 | """)
32 | RECASTS = SQL("RECASTS", """
33 | SELECT reactions.fid as i, reactions.target_fid as j, count(1) as recasts_v
34 | FROM reactions
35 | INNER JOIN fids ON fids.fid = reactions.target_fid
36 | WHERE reaction_type=2
37 | AND reactions.target_fid IS NOT NULL
38 | {condition}
39 | GROUP BY i, j
40 | """)
41 | FOLLOWS = SQL("FOLLOWS", """
42 | SELECT
43 | links.fid as i,
44 | links.target_fid as j,
45 | 1 as follows_v
46 | FROM links
47 | INNER JOIN fids ON fids.fid = links.target_fid
48 | WHERE type = 'follow'::text
49 | {condition}
50 | ORDER BY i, j, follows_v desc
51 | """)
52 |
53 | class IVSql:
54 | PRETRUST_TOP_TIER = SQL("PRETRUST_TOP_TIER", """
55 | WITH pt_size AS (
56 | select count(*) as ct from pretrust_v2
57 | where insert_ts=(select max(insert_ts) from pretrust_v2 where strategy_id = {strategy})
58 | and strategy_id = {strategy}
59 | )
60 | SELECT fid as i, 1/ct::numeric as v
61 | FROM pretrust_v2, pt_size
62 | WHERE insert_ts=(select max(insert_ts) from pretrust_v2 where strategy_id = {strategy})
63 | AND strategy_id = {strategy}
64 | """)
65 | PRETRUST_POPULAR = SQL("PRETRUST_POPULAR", """
66 | SELECT
67 | c.fid AS i,
68 | 1/20::numeric as v
69 | FROM
70 | reactions r
71 | INNER JOIN casts c ON c.hash = r.target_cast_hash
72 | INNER JOIN user_data u ON c.fid = u.fid AND u.type = 6
73 | WHERE
74 | r.created_at >= current_timestamp - interval '7' day
75 | GROUP BY
76 | c.fid
77 | ORDER BY
78 | COUNT(*) DESC
79 | LIMIT 20
80 | """)
81 | PRETRUST_OG = SQL("PRETRUST_OG", """
82 | SELECT
83 | distinct fid as i,
84 | 1/11::numeric as v
85 | FROM user_data
86 | WHERE
87 | value in ('dwr.eth', 'varunsrin.eth', 'balajis.eth',
88 | 'vitalik.eth','ccarella.eth','tim',
89 | 'lesgreys.eth','linda','ace',
90 | 'vm','cdixon.eth')
91 | AND type=6
92 | """)
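93 |
94 | # Note (assumption for illustration): the {condition} placeholder in the IJVSql
95 | # queries is substituted by the caller before execution, e.g. with a hypothetical
96 | # time-window filter such as "AND timestamp >= now() - interval '90 days'";
97 | # an empty string yields the all-time interaction graph.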
--------------------------------------------------------------------------------
/pipeline/globaltrust/test_data.py:
--------------------------------------------------------------------------------
1 | # standard dependencies
2 | import logging
3 |
4 | # local dependencies
5 | import utils
6 | from config import settings
7 | from . import compute
8 | from .queries import IJVSql
9 |
10 | # 3rd party dependencies
11 | from dotenv import load_dotenv
12 | import pandas as pd
13 |
14 | if __name__ == '__main__':
15 | load_dotenv()
16 | print(settings)
17 |
18 | logger = logging.getLogger()
19 | utils.setup_filelogger(logger, __file__)
20 | logger.setLevel(logging.DEBUG)
21 | utils.setup_consolelogger(logger)
22 |
23 | pg_dsn = settings.ALT_POSTGRES_DSN.get_secret_value()
24 |
25 | df = compute._fetch_interactions_df(logger, pg_dsn)
26 | logger.info(utils.df_info_to_string(df, with_sample=True))
27 |
28 | pkl_file = '/tmp/fc_interactions_df.pkl'
29 | logger.info(f"Pickling interactions dataframe to {pkl_file}")
30 | df.to_pickle(pkl_file)
31 | logger.info(f"Done pickling interactions dataframe to {pkl_file}")
32 |
33 | num_ij_pairs = df[df['follows_v'].notna()].groupby(['i', 'j']).ngroups
34 | logger.info(f"Unique i,j follow pairs: {num_ij_pairs}")
35 |
36 | num_selfies = len(df[df['i']==df['j']])
37 | logger.info(f"Number of self followers: {num_selfies}")
38 |
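39 | # Run (illustrative; mirrors how other modules are invoked from the pipeline root):
40 | #   python3 -m globaltrust.test_data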
--------------------------------------------------------------------------------
/pipeline/graph/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/pipeline/graph/__init__.py
--------------------------------------------------------------------------------
/pipeline/graph/export_existingConnections_addr.sql:
--------------------------------------------------------------------------------
1 | SELECT
2 | '0x'||encode(coalesce(v1.signer_address, f1.custody_address),'hex') as i,
3 | '0x'||encode(coalesce(v2.signer_address, f2.custody_address),'hex') as j,
4 | lt.v
5 | FROM localtrust as lt
6 | INNER JOIN fids as f1 on (f1.fid = cast(lt.i as int8))
7 | INNER JOIN fids as f2 on (f2.fid = cast(lt.j as int8))
8 | LEFT JOIN verifications as v1 on (v1.fid = f1.fid)
9 | LEFT JOIN verifications as v2 on (v2.fid = f2.fid)
10 | WHERE
11 | lt.strategy_id=1
12 | AND lt.date=(select max(date) from localtrust where strategy_id=1)
13 |
14 |
--------------------------------------------------------------------------------
/pipeline/graph/export_existingConnections_fid.sql:
--------------------------------------------------------------------------------
1 | select
2 | i,
3 | j,
4 | v
5 | from
6 | localtrust
7 | where
8 | strategy_id=1
9 | and date=(select max(date) from localtrust where strategy_id=1)
10 | -- comment out below code for local testing
11 | -- AND i::integer < 10
12 | -- ORDER BY random()
13 | -- LIMIT 1000
--------------------------------------------------------------------------------
/pipeline/graph/export_l1rep6rec3m12enhancedConnections_addr.sql:
--------------------------------------------------------------------------------
1 | SELECT
2 | '0x'||encode(coalesce(v1.signer_address, f1.custody_address),'hex') as i,
3 | '0x'||encode(coalesce(v2.signer_address, f2.custody_address),'hex') as j,
4 | lt.v
5 | FROM localtrust as lt
6 | INNER JOIN fids as f1 on (f1.fid = cast(lt.i as int8))
7 | INNER JOIN fids as f2 on (f2.fid = cast(lt.j as int8))
8 | LEFT JOIN verifications as v1 on (v1.fid = f1.fid)
9 | LEFT JOIN verifications as v2 on (v2.fid = f2.fid)
10 | WHERE
11 | lt.strategy_id=3
12 | AND lt.date=(select max(date) from localtrust where strategy_id=3)
--------------------------------------------------------------------------------
/pipeline/graph/export_l1rep6rec3m12enhancedConnections_fid.sql:
--------------------------------------------------------------------------------
1 | select
2 | i,
3 | j,
4 | v
5 | from
6 | localtrust
7 | where
8 | strategy_id=3
9 | and date=(select max(date) from localtrust where strategy_id=3)
10 | -- comment out below code for local testing
11 | -- AND i::integer < 10
12 | -- ORDER BY random()
13 | -- LIMIT 1000
--------------------------------------------------------------------------------
/pipeline/graph/rechunk_graph_pqt.py:
--------------------------------------------------------------------------------
1 | # standard dependencies
2 | from pathlib import Path
3 | import argparse
4 | import sys
5 | import os
6 |
7 | # local dependencies
8 |
9 | # 3rd party dependencies
10 | from loguru import logger
11 | import polars as pl
12 |
13 | def main(indir: Path, outfile: Path):
14 |
15 | logger.info(f"reading parquet files {indir}/*.pqt")
16 | pq_files = [os.path.join(indir, f) for f in os.listdir(indir) if f.endswith('.pqt')]
17 | if not pq_files:
18 | raise FileNotFoundError(f"No parquet files found in {indir}")
19 |
20 | # Read all parquet files into a list of DataFrames
21 | dfs = []
22 | for file in pq_files:
23 | try:
24 | df = pl.read_parquet(file, rechunk=True, low_memory=False)
25 | dfs.append(df)
26 | logger.debug(f"Successfully read {file}")
27 | except Exception as e:
28 | logger.error(f"Error reading {file}: {e}")
29 |
30 | if not dfs:
31 | raise ValueError("No valid parquet files could be read")
32 |
33 | # Concatenate all DataFrames into a single DataFrame
34 | pq_df = pl.concat(dfs)
35 |
36 | logger.info(f"df estimated_size: {pq_df.estimated_size('mb')}")
37 | logger.info(f"df describe: {pq_df.describe()}")
38 | logger.info(f"df sample: {pq_df.sample(n=min(5, len(pq_df)))}")
39 |
40 | logger.info(f"writing to parquet file {outfile}")
41 | pq_df.write_parquet(outfile,
42 | use_pyarrow=True,
43 | statistics=True,
44 | pyarrow_options={
45 | "write_statistics": True,
46 | "row_group_size": 100_000})
47 |
48 | if __name__ == '__main__':
49 | parser = argparse.ArgumentParser()
50 | parser.add_argument("-i", "--indir",
51 | help="input directory with all pqt files",
52 | required=True,
53 | type=lambda f: Path(f).expanduser().resolve())
54 | parser.add_argument("-o", "--outfile",
55 | help="output filename",
56 | required=True,
57 | type=lambda f: Path(f).expanduser().resolve())
58 |
59 | args = parser.parse_args()
60 | print(args)
61 |
62 | logger.remove()
63 | logger.add(sys.stderr, level='INFO')
64 |
65 | if os.path.isdir(args.outfile):
66 | logger.error("-o / --outfile should be a file not a directory")
67 | sys.exit(1)
68 | main(args.indir, args.outfile)
69 |
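70 | # Example invocation (illustrative; paths are placeholders):
71 | #   python3 -m graph.rechunk_graph_pqt -i /tmp/personal-graph/ -o /tmp/personal_graph.pqt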
--------------------------------------------------------------------------------
/pipeline/igraph-docker-compose.yml:
--------------------------------------------------------------------------------
1 | services:
2 | igraph:
3 | build:
4 | dockerfile: igraph.Dockerfile
5 | deploy:
6 | mode: replicated
7 | replicas: 2
8 | image: igraph:latest
9 | volumes:
10 | - /home/ubuntu/serve_files:/home/ubuntu/serve_files:z
11 | expose:
12 | - '8000'
13 | restart: "on-failure"
14 | networks:
15 | - farcaster-network
16 | nginx:
17 | image: nginx:latest
18 | volumes:
19 | - ./igraph.nginx.conf:/etc/nginx/nginx.conf:ro
20 | depends_on:
21 | - igraph
22 | ports:
23 | - "4000:4000"
24 | networks:
25 | - farcaster-network
26 |
27 | networks:
28 | farcaster-network:
29 | name: farcaster-network
30 | external: true
31 |
--------------------------------------------------------------------------------
/pipeline/igraph.Dockerfile:
--------------------------------------------------------------------------------
1 | # FROM python:3.12-alpine
2 | # not taking the alpine route because packages like psutil don't install without gcc
3 | FROM python:3.12-slim
4 |
5 | RUN pip install --upgrade pip
6 |
7 | WORKDIR /server
8 |
9 | # don't copy code yet otherwise docker layers will get invalidated every code push
10 | COPY ./requirements.txt /server
11 |
12 | RUN python -m ensurepip --upgrade
13 | RUN python -m pip install --no-cache-dir --upgrade -r requirements.txt
14 |
15 | # copy rest of the code
16 | COPY . /server
17 |
18 | CMD ["uvicorn", "graph.serve_igraph:app", "--host", "0.0.0.0", "--port", "8000", "--timeout-keep-alive", "300"]
--------------------------------------------------------------------------------
/pipeline/igraph.nginx.conf:
--------------------------------------------------------------------------------
1 | user nginx;
2 | worker_processes auto;
3 | worker_rlimit_nofile 30000;
4 |
5 | events {
6 | worker_connections 4096;
7 | }
8 |
9 | http {
10 | keepalive_timeout 65;
11 | keepalive_requests 100000;
12 | tcp_nopush on;
13 | tcp_nodelay on;
14 |
15 | upstream igraph_servers {
16 | server igraph:8000;
17 | }
18 |
19 | server {
20 | listen 4000;
21 |
22 | location / {
23 | proxy_pass http://igraph_servers;
24 | proxy_connect_timeout 300s;
25 | proxy_send_timeout 300s;
26 | proxy_read_timeout 300s;
27 | }
28 | }
29 | }
--------------------------------------------------------------------------------
/pipeline/logs/.placeholder:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/pipeline/logs/.placeholder
--------------------------------------------------------------------------------
/pipeline/plugins/.placeholder:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/pipeline/plugins/.placeholder
--------------------------------------------------------------------------------
/pipeline/plugins/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/pipeline/plugins/__init__.py
--------------------------------------------------------------------------------
/pipeline/plugins/hooks/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/pipeline/plugins/hooks/__init__.py
--------------------------------------------------------------------------------
/pipeline/plugins/hooks/common.py:
--------------------------------------------------------------------------------
1 | from urllib.parse import urlparse, urlunparse
2 | from airflow.models import Variable
3 |
4 | def convert_hostname(url: str):
5 | # Parse the original URL
6 | parsed_url = urlparse(url)
7 |
8 | # Replace the scheme and netloc with the new hostname
9 | new_netloc = Variable.get("airflow_hostname")
10 | new_scheme = "https"
11 |
12 | # Construct the new URL
13 | return urlunparse((new_scheme, new_netloc) + parsed_url[2:])
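14 |
15 | # Example (illustrative; "airflow.example.com" stands in for the "airflow_hostname"
16 | # Airflow Variable):
17 | #   convert_hostname("http://localhost:8080/log?dag_id=x")
18 | #     -> "https://airflow.example.com/log?dag_id=x"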
--------------------------------------------------------------------------------
/pipeline/plugins/hooks/discord.py:
--------------------------------------------------------------------------------
1 | # copied from https://medium.com/@artur.aacs/airflow-send-alerts-with-discord-69f343dfa8dd
2 | import re
3 | from typing import Optional
4 | from datetime import datetime
5 |
6 | from airflow.models import Variable, TaskInstance
7 | from discord_webhook import DiscordWebhook, DiscordEmbed
8 | from hooks.common import convert_hostname
9 |
10 | TI = TaskInstance
11 |
12 | def send_alert_discord(context):
13 | # Get Task Instances variables
14 | last_task: Optional[TaskInstance] = context.get('task_instance')
15 | task_name = last_task.task_id
16 | dag_name = last_task.dag_id
17 | log_link = convert_hostname(last_task.log_url)
18 | execution_date = datetime.fromisoformat(str(context.get('execution_date')))
19 |
20 | # Extract reason for the exception
21 | # try:
22 | # error_message = str(context["exception"])
23 | # error_message = error_message[:1000] + (error_message[1000:] and '...')
24 | # str_start = re.escape("{'reason': ")
25 | # str_end = re.escape('"}.')
26 | # error_message = re.search('%s(.*)%s' % (str_start, str_end), error_message).group(1)
27 | # error_message = "{'reason': " + error_message + ',}'
28 | # except:
29 | # error_message = "Some error that cannot be extracted has occurred. Visit the logs!"
30 |
31 | print('Sending discord alert')
32 |
33 | # Send Alert
34 | webhook = DiscordWebhook(url=Variable.get("discord_webhook")) # Update variable name with your change
35 | print('execution_date', execution_date)
36 | embed = DiscordEmbed(title="Airflow Alert - Task has failed!", color='CC0000', url=log_link, timestamp=execution_date)
37 | embed.add_embed_field(name="DAG", value=dag_name, inline=True)
38 | embed.add_embed_field(name="PRIORITY", value="HIGH", inline=True)
39 | embed.add_embed_field(name="TASK", value=task_name, inline=False)
40 | embed.add_embed_field(name="ERROR", value=str(context["exception"]))
41 | webhook.add_embed(embed)
42 | response = webhook.execute()
43 |
44 | return response
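45 |
46 | # Illustrative wiring (assumption): this callback is typically attached via a DAG's
47 | # default_args, e.g. default_args = {"on_failure_callback": send_alert_discord}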
--------------------------------------------------------------------------------
/pipeline/plugins/hooks/pagerduty.py:
--------------------------------------------------------------------------------
1 | from typing import Optional
2 |
3 | from airflow.providers.pagerduty.notifications.pagerduty import send_pagerduty_notification
4 | from airflow.providers.pagerduty.hooks.pagerduty_events import PagerdutyEventsHook
5 | from airflow.providers.pagerduty.hooks.pagerduty import PagerdutyHook
6 |
7 | from hooks.common import convert_hostname
8 | from airflow.models import Variable, TaskInstance
9 |
10 | # refer to https://github.com/astronomer/pagerduty_airflow_integration_benefits/blob/main/README.md
11 | def send_alert_pagerduty(context):
12 | # Get Task Instances variables
13 | last_task: Optional[TaskInstance] = context.get('task_instance')
14 | log_link = convert_hostname(last_task.log_url)
15 | print('log_link', log_link)
16 |
17 | task_id = last_task.task_id
18 | dag_id = last_task.dag_id
19 | # pagerduty_default needs to be saved on Admin->Variable on the console with Pagerduty Events
20 | integration_key=Variable.get("pagerduty_default")
21 |
22 | print('Sending pagerduty alert')
23 | return PagerdutyEventsHook(integration_key).send_event(
24 | summary=f"Airflow Alert - {dag_id}-{task_id} failed",
25 | severity="critical",
26 | source=f"airflow dag_id: {dag_id}",
27 | dedup_key=f"{dag_id}-{task_id}",
28 | group=f"{dag_id}",
29 | component="airflow",
30 | class_type="Prod Data Pipeline",
31 | custom_details=str(context["exception"]),
32 | links=[{
33 | 'href': log_link,
34 | 'text': 'Link to errored task log'
35 | }],
36 | )
--------------------------------------------------------------------------------
/pipeline/requirements.txt:
--------------------------------------------------------------------------------
1 | pandas==2.1.3
2 | python-dotenv==1.0.0
3 | igraph==0.11.3
4 | psutil==5.9.8
5 | psycopg2-binary==2.9.9
6 | pydantic-settings==2.2.1
7 | sqlalchemy==1.4.52
8 | requests==2.31.0
9 | loguru==0.7.2
10 | beautifulsoup4==4.12.3
11 | aiohttp==3.9.3
12 | tldextract==5.1.1
13 | niquests==3.5.5
14 | polars==0.20.27
15 | pyarrow==16.1.0
16 | fastapi==0.111.0
17 | apache-airflow==2.9.2
18 | dune-client==1.7.4
19 | openrank-sdk==0.2.2
20 | apache-airflow-providers-ssh==3.12.0
21 | asyncpg==0.29.0
22 | tomlkit==0.13.2
--------------------------------------------------------------------------------
/pipeline/run_cast_pipeline.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | DT_FORMAT='%Y-%m-%d %H:%M:%S'
4 |
5 | # Function to validate date format
6 | function validate_date() {
7 | date_to_check="$1"
8 | date_format="$2"
9 |
10 | # Check if the date matches the format YYYY-mm-dd
11 | if [[ $(uname) == "Darwin" ]]; then
12 | if ! date -j -f "$date_format" "$date_to_check" >/dev/null 2>&1; then
13 | echo "Invalid date format. Use YYYY-mm-dd."
14 | exit 1
15 | fi
16 | else
17 | if ! date -d "$date_to_check" +"$date_format" >/dev/null 2>&1; then
18 | echo "Invalid date format. Use YYYY-mm-dd."
19 | exit 1
20 | fi
21 | fi
22 |
23 | # Check if the date is in the past
24 | today=$(date +"$date_format")
25 | if [ "$date_to_check" \> "$today" ] || [ "$date_to_check" == "$today" ]; then
26 | echo "The date must be in the past and not include today."
27 | exit 1
28 | fi
29 | }
30 |
31 | while getopts dv:f:t:p:m: flag
32 | do
33 | case "${flag}" in
34 | d) DAEMON_FLAG="--daemon";;
35 | v) VENV=${OPTARG};;
36 | f) FILL_TYPE=${OPTARG};;
37 | t) TARGET_DATE=${OPTARG};;
38 | m) TARGET_MONTH=${OPTARG};;
39 | p) POSTGRES=${OPTARG};;
40 | esac
41 | done
42 |
43 | if [ -z "$VENV" ]; then
44 | echo "Usage: $0 -v [venv] -p [postgres] -d -t [fill_type]"
45 | echo ""
46 | echo "Example: $0 -v /home/ubuntu/venvs/fc-graph-env3/"
47 | echo "Example: $0 -v /home/ubuntu/venvs/fc-graph-env3/ -p eigen8 -d -t backfill"
48 | echo ""
49 | echo "Params:"
50 | echo " [venv] The path where a python3 virtualenv has been created."
51 | echo " [postgres] 'eigen2' or 'eigen8'"
52 | echo " [daemon] Run in daemon mode."
53 | echo " [fill_type] Run in 'default' or 'backfill' or 'gapfill' mode."
54 | echo ""
55 | exit
56 | fi
57 |
58 | if [ ! -z "$POSTGRES" ]; then
59 | PG_OPTION="--postgres $POSTGRES"
60 | fi
61 |
62 | FILL_TYPE=${FILL_TYPE:-default}
63 |
64 | if [ ! -z "$TARGET_DATE" ]; then
65 | validate_date "$TARGET_DATE" "$DT_FORMAT"
66 | DATE_OPTION=(--target-date "$TARGET_DATE")
67 | fi
68 |
69 | # validating TARGET_MONTH in bash is a bit of a pain
70 | # ... let the python script validate it
71 | if [ ! -z "$TARGET_MONTH" ]; then
72 | MONTH_OPTION="--target-month $TARGET_MONTH"
73 | fi
74 |
75 |
76 | # set -x
77 | set -e
78 | set -o pipefail
79 |
80 | function log() {
81 | echo "`date` - $1"
82 | }
83 |
84 | source $VENV/bin/activate
85 | # pip install -r requirements.txt
86 | python3 -m casts.main $PG_OPTION $DAEMON_FLAG -f $FILL_TYPE "${DATE_OPTION[@]}" $MONTH_OPTION
87 | deactivate
88 |
89 | log "Done"
--------------------------------------------------------------------------------
/pipeline/run_channel_metrics.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | while getopts "w:v:rd" flag
4 | do
5 | case "${flag}" in
6 | w) WORK_DIR=${OPTARG};;
7 | v) VENV=${OPTARG};;
8 | r) RUN_FLAG="--run";;
9 | d) DRYRUN_FLAG="--dry-run";;
10 | esac
11 | done
12 |
13 | if [ -z "$WORK_DIR" ] || [ -z "$VENV" ] || [ -z "$RUN_FLAG" ]; then
14 | echo "Usage: $0 -w [work_dir] -v [venv] -r -d"
15 | echo ""
16 | echo "Example: $0 -w . -v /home/ubuntu/venvs/fc-graph-env3/ -r"
17 | echo "Example: $0 -w . -v /home/ubuntu/venvs/fc-graph-env3/ -r -d"
18 | echo ""
19 | echo "Params:"
20 | echo " [work_dir] The working directory to read .env file and execute scripts from."
21 | echo " [venv] The path where a python3 virtualenv has been created."
22 | echo " [run] Flag to run the script."
23 | echo " [dryrun] Flag to run the script in dry-run mode."
24 | echo ""
25 | exit
26 | fi
27 |
28 | source $WORK_DIR/.env
29 |
30 | # set -x
31 | set -e
32 | set -o pipefail
33 |
34 | function log() {
35 | echo "`date` - $1"
36 | }
37 |
38 | source $VENV/bin/activate
39 | #pip install -r requirements.txt
40 | python3 -m channels.main_metrics $RUN_FLAG $DRYRUN_FLAG
41 | deactivate
42 |
--------------------------------------------------------------------------------
/pipeline/run_download_pqt_files_v1.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # dayOfYear=`date '+%j'`
4 | # hourOfDay=`date '+%H'`
5 | # hourOfYear="$((dayOfYear * 24 + hourOfDay))"
6 | # echo $dayOfYear $hourOfDay $hourOfYear
7 | # hour_interval=48
8 |
9 | # # TODO use the mtime of the existing parquet file and
10 | # # ..if current time - mtime > 1 hour, start compute
11 | # if [ `expr $hourOfYear % $hour_interval` -eq 0 ]; then
12 | # echo "This is hour $hour_interval. Continuing with script."
13 | # else
14 | # echo "This not hour $hour_interval. Exiting now."
15 | # exit 0
16 | # fi
17 |
18 |
19 | while getopts o:s: flag
20 | do
21 | case "${flag}" in
22 | o) OUT_DIR=${OPTARG};;
23 | s) S3_BKT=${OPTARG};;
24 | esac
25 | done
26 |
27 | if [ -z "$OUT_DIR" ] || [ -z "$S3_BKT" ]; then
28 | echo "Usage: $0 -o [out_dir] -s [s3_bkt]"
29 | echo ""
30 | echo "Example: $0 \ "
31 | echo " -i /home/ubuntu/serve_files/lt_engagement_fid.csv \ "
32 | echo " -w . \ "
33 | echo " -v .venv \ "
34 | echo " -o /tmp/personal-graph/ \ "
35 | echo " -s k3l-openrank-farcaster \ "
36 | echo ""
37 | echo "Params:"
38 | echo " [in_csv] The source file to read dataframe from."
39 | echo " [out_dir] The output directory to write the graph file."
40 | echo " [work_dir] The working directory to read .env file and execute scripts from."
41 | echo " [venv] The path where a python3 virtualenv has been created."
42 | echo " [s3_bkt] The S3 bucket to upload the graph file to."
43 | echo " [task] task to run. choose one: graph_reload, generate, fetch_fids, consolidate"
44 | echo " [fids] comma separated fids to run '1,2,3,420,69'"
45 | echo " [run_id] airflow run id. eg) 'manual__2024-07-22T06:46:15.813325+00:00' "
46 | echo " [map_index] airflow map index"
47 | echo ""
48 | exit
49 | fi
50 |
51 | source $WORK_DIR/.env
52 |
53 | set -x
54 | set -e
55 | set -o pipefail
56 |
57 | aws s3 cp s3://${S3_BKT}/personal_graph.parquet $OUT_DIR/personal_graph.parquet
--------------------------------------------------------------------------------
/pipeline/run_eigen2_postgres_sql.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | while getopts w: flag
3 | do
4 | case "${flag}" in
5 | w) WORK_DIR=${OPTARG};;
6 | esac
7 | done
8 |
9 | shift $((OPTIND-1))
10 | SQL_STATEMENT="$1"
11 |
12 | if [ -z "$WORK_DIR" ]; then
13 | echo "Usage: $0 -w [work_dir] [sql_statement]"
14 | echo ""
15 | echo "Example: $0 -w . -c 'REFRESH MATERIALIZED VIEW CONCURRENTLY k3l_rank;'"
16 | echo ""
17 | echo "Params:"
18 | echo " [work_dir] The working directory to read .env file and execute scripts from."
19 | echo " [sql_statement] Optional sql statement to execute."
20 | echo ""
21 | exit 1
22 | fi
23 |
24 | source $WORK_DIR/.env
25 |
26 | DB_HOST=${DB_HOST:-127.0.0.1}
27 | DB_PORT=${DB_PORT:-5432}
28 | DB_USER=${DB_USER:-replicator}
29 | DB_NAME=${DB_NAME:-replicator}
30 | DB_PASSWORD=${DB_PASSWORD:-password} # psql requires PGPASSWORD to be set
31 |
32 | # set -x
33 | set -e
34 | set -o pipefail
35 |
36 | if hash psql 2>/dev/null; then
37 | echo "OK, you have psql in the path. We’ll use that."
38 | PSQL=psql
39 | else
40 | echo "You don't have psql is the path. Let's try /usr/bin"
41 | hash /usr/bin/psql
42 | PSQL=/usr/bin/psql
43 | fi
44 |
45 | PGPASSWORD=$DB_PASSWORD $PSQL -e -h $DB_HOST -p $DB_PORT -U $DB_USER -d $DB_NAME \
46 | -c "$SQL_STATEMENT"
--------------------------------------------------------------------------------
/pipeline/run_eigen8_postgres_sql.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | while getopts w: flag
3 | do
4 | case "${flag}" in
5 | w) WORK_DIR=${OPTARG};;
6 | esac
7 | done
8 |
9 | shift $((OPTIND-1))
10 | SQL_STATEMENT="$1"
11 |
12 | if [ -z "$WORK_DIR" ]; then
13 | echo "Usage: $0 -w [work_dir] [sql_statement]"
14 | echo ""
15 | echo "Example: $0 -w . -c 'REFRESH MATERIALIZED VIEW CONCURRENTLY k3l_rank;'"
16 | echo ""
17 | echo "Params:"
18 | echo " [work_dir] The working directory to read .env file and execute scripts from."
19 | echo " [sql_statement] Optional sql statement to execute."
20 | echo ""
21 | exit 1
22 | fi
23 |
24 | source $WORK_DIR/.env
25 |
26 | ALT_REMOTE_DB_HOST=${ALT_REMOTE_DB_HOST:-127.0.0.1}
27 | ALT_REMOTE_DB_PORT=${ALT_REMOTE_DB_PORT:-5432}
28 | ALT_REMOTE_DB_USER=${ALT_REMOTE_DB_USER:-k3l_user}
29 | ALT_REMOTE_DB_NAME=${ALT_REMOTE_DB_NAME:-farcaster}
30 | ALT_REMOTE_DB_PASSWORD=${ALT_REMOTE_DB_PASSWORD:-password} # psql requires PGPASSWORD to be set
31 |
32 | # set -x
33 | set -e
34 | set -o pipefail
35 |
36 | if hash psql 2>/dev/null; then
37 | echo "OK, you have psql in the path. We’ll use that."
38 | PSQL=psql
39 | else
40 | echo "You don't have psql is the path. Let's try /usr/bin"
41 | hash /usr/bin/psql
42 | PSQL=/usr/bin/psql
43 | fi
44 |
45 | PGPASSWORD=$ALT_REMOTE_DB_PASSWORD $PSQL -e -h $ALT_REMOTE_DB_HOST \
46 | -p $ALT_REMOTE_DB_PORT -U $ALT_REMOTE_DB_USER -d $ALT_REMOTE_DB_NAME \
47 | -c "$SQL_STATEMENT"
--------------------------------------------------------------------------------
/pipeline/run_fetch_channel_top_caster.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | while getopts w:i:v:c: flag
4 | do
5 | case "${flag}" in
6 | w) WORK_DIR=${OPTARG};;
7 | v) VENV=${OPTARG};;
8 | c) CSV_PATH=${OPTARG};;
9 | esac
10 | done
11 |
12 | shift $((OPTIND-1))
13 | CHANNEL_IDS="$1"
14 |
15 | if [ -z "$VENV" ] || [ -z "$CSV_PATH" ]; then
16 | echo "Usage: $0 -w [work_dir] -v [venv] -c [csv_path] [channel_ids]"
17 | echo ""
18 | echo "Example: $0 -w . -v /home/ubuntu/venvs/fc-graph-env3/ -c channels/Top_Channels.csv"
19 | echo ""
20 | echo "Params:"
21 | echo " [work_dir] The working directory to read .env file and execute scripts from."
22 | echo " [venv] The path where a python3 virtualenv has been created."
23 | echo " [csv_path] The path to the CSV file."
24 | echo ""
25 | exit 1
26 | fi
27 |
28 | log() {
29 | echo "`date` - $1"
30 | }
31 |
32 | log "Starting script with parameters: WORK_DIR=${WORK_DIR}, VENV=${VENV}, CSV_PATH=${CSV_PATH}"
33 |
34 | source $WORK_DIR/.env
35 |
36 | set -e
37 | set -o pipefail
38 |
42 |
43 | log "Activating virtual environment"
44 | source $VENV/bin/activate
45 | # pip install -r requirements.txt
46 | log "Executing task"
47 | python3 -m channels.main_fetch_channel_top_casters -c "$CSV_PATH"
48 | deactivate
49 |
50 |
--------------------------------------------------------------------------------
/pipeline/run_fetch_top_caster.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | while getopts v:s: flag
4 | do
5 | case "${flag}" in
6 | v) VENV=${OPTARG};;
7 | esac
8 | done
9 |
10 | if [ -z "$VENV" ]; then
11 | echo "Usage: $0 -v [venv]"
12 | echo ""
13 | echo "Example: $0 -v /home/ubuntu/venvs/fc-graph-env3/"
14 | echo ""
15 | echo "Params:"
16 | echo " [venv] The path where a python3 virtualenv has been created."
17 | echo ""
18 | exit
19 | fi
20 |
21 | # set -x
22 | set -e
23 | set -o pipefail
24 |
25 | function log() {
26 | echo "`date` - $1"
27 | }
28 |
29 | source $VENV/bin/activate
30 | # pip install -r requirements.txt
31 | python3 -m casts.main_fetch_top_casters
32 | deactivate
33 |
--------------------------------------------------------------------------------
/pipeline/run_fetch_top_spammers.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | while getopts v:s: flag
4 | do
5 | case "${flag}" in
6 | v) VENV=${OPTARG};;
7 | esac
8 | done
9 |
10 | if [ -z "$VENV" ] ; then
11 | echo "Usage: $0 -v [venv]"
12 | echo ""
13 | echo "Example: $0 -v /home/ubuntu/venvs/fc-graph-env3/"
14 | echo ""
15 | echo "Params:"
16 | echo " [venv] The path where a python3 virtualenv has been created."
17 | echo ""
18 | exit
19 | fi
20 |
21 | # set -x
22 | set -e
23 | set -o pipefail
24 |
25 | function log() {
26 | echo "`date` - $1"
27 | }
28 |
29 | source $VENV/bin/activate
30 | # pip install -r requirements.txt
31 | python3 -m casts.main_fetch_top_spammers
32 | deactivate
33 |
--------------------------------------------------------------------------------
/pipeline/run_frame_scraper.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | while getopts d:v: flag
4 | do
5 | case "${flag}" in
6 | d) DAEMON=${OPTARG};;
7 | v) VENV=${OPTARG};;
8 | esac
9 | done
10 |
11 | if [ -z "$VENV" ]; then
12 | echo "Usage: $0 -v [venv]"
13 | echo ""
14 | echo "Example: $0 -v /home/ubuntu/venvs/fc-graph-env3/"
15 | echo ""
16 | echo "Params:"
17 | echo " [venv] The path where a python3 virtualenv has been created."
18 | echo ""
19 | exit
20 | fi
21 |
22 | # set -x
23 | set -e
24 | set -o pipefail
25 |
26 | function log() {
27 | echo "`date` - $1"
28 | }
29 |
30 | DAEMON=${DAEMON:-false}
31 |
32 | source $VENV/bin/activate
33 | # pip install -r requirements.txt
34 | mkdir -p tmp/tldcache
35 | export TLDEXTRACT_CACHE=tmp/tldcache
36 | python3 -m frames.main -d $DAEMON
37 | deactivate
38 |
--------------------------------------------------------------------------------
/pipeline/run_graph_pipeline.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | while getopts i:o:p:w:v: flag
4 | do
5 | case "${flag}" in
6 | i) IN_FILE=${OPTARG};;
7 | o) OUT_DIR=${OPTARG};;
8 | p) OUT_PREFIX=${OPTARG};;
9 | w) WORK_DIR=${OPTARG};;
10 | v) VENV=${OPTARG};;
11 | esac
12 | done
13 |
14 | if [ -z "$IN_FILE" ] || [ -z "$OUT_DIR" ] || [ -z "$OUT_PREFIX" ] || [ -z "$WORK_DIR" ] || [ -z "$VENV" ]; then
15 | echo "Usage: $0 -w [work_dir] -v [venv] -i [in_file] -o [out_dir] -p [out_prefix]"
16 | echo ""
17 | echo "Example: $0 -w . -v /home/ubuntu/venvs/fc-graph-env3/ -i /tmp -o /tmp -p test"
18 | echo ""
19 | echo "Params:"
20 | echo " [work_dir] The working directory to read .env file and execute scripts from."
21 | echo " [venv] The path where a python3 virtualenv has been created."
22 | echo " [in_file] The input localtrust (i,j,v edge list) csv file."
23 | echo " [out_dir] The output directory to write the graph file."
24 | echo " [out_prefix] The prefix of the output graph files."
25 | echo ""
26 | exit
27 | fi
28 |
29 |
30 | source $WORK_DIR/.env
31 |
32 | # set -x
33 | set -e
34 | set -o pipefail
35 |
36 | function log() {
37 | echo "`date` - $1"
38 | }
39 |
40 | mkdir -p $OUT_DIR
41 |
42 | source $VENV/bin/activate
43 | #pip install -r requirements.txt
44 | python3 -m graph.gen_igraph -i $IN_FILE -o $OUT_DIR -p $OUT_PREFIX
45 | touch $OUT_DIR/${OUT_PREFIX}_SUCCESS
46 | deactivate
47 |
--------------------------------------------------------------------------------
/pipeline/run_notify_channel_daily_trending.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | while getopts "w:v:c:d" flag
4 | do
5 | case "${flag}" in
6 | w) WORK_DIR=${OPTARG};;
7 | v) VENV=${OPTARG};;
8 | c) CSV_PATH=${OPTARG};;
9 | d) DRYRUN_FLAG="--dry-run";;
10 | esac
11 | done
12 |
13 | if [ -z "$WORK_DIR" ] || [ -z "$VENV" ] || [ -z "$CSV_PATH" ]; then
14 | echo "Usage: $0 -w [work_dir] -v [venv] -c [csv_path] -d"
15 | echo ""
16 | echo "Example: $0 -w . -v /home/ubuntu/venvs/fc-graph-env3/ -c channels/Trending_Channels.csv"
17 | echo "Example: $0 -w . -v /home/ubuntu/venvs/fc-graph-env3/ -c channels/Trending_Channels.csv -d"
18 | echo ""
19 | echo "Params:"
20 | echo " [work_dir] The working directory to read .env file and execute scripts from."
21 | echo " [venv] The path where a python3 virtualenv has been created."
22 | echo " [csv_path] Path to CSV file."
23 | echo " [dryrun] Flag to run the script in dry-run mode."
24 | echo ""
25 | exit
26 | fi
27 |
28 | source $WORK_DIR/.env
29 |
30 | # set -x
31 | set -e
32 | set -o pipefail
33 |
34 | function log() {
35 | echo "`date` - $1"
36 | }
37 |
38 | source $VENV/bin/activate
39 | #pip install -r requirements.txt
40 | python3 -m channels.main_notify_daily_trending -c "$CSV_PATH" $DRYRUN_FLAG
41 | deactivate
42 |
--------------------------------------------------------------------------------
/pipeline/run_notify_channel_leaderboard.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | while getopts "w:v:rd" flag
4 | do
5 | case "${flag}" in
6 | w) WORK_DIR=${OPTARG};;
7 | v) VENV=${OPTARG};;
8 | r) RUN_FLAG="--run";;
9 | d) DRYRUN_FLAG="--dry-run";;
10 | esac
11 | done
12 |
13 | if [ -z "$WORK_DIR" ] || [ -z "$VENV" ] || [ -z "$RUN_FLAG" ]; then
14 | echo "Usage: $0 -w [work_dir] -v [venv] -r -d"
15 | echo ""
16 | echo "Example: $0 -w . -v /home/ubuntu/venvs/fc-graph-env3/ -r"
17 | echo "Example: $0 -w . -v /home/ubuntu/venvs/fc-graph-env3/ -r -d"
18 | echo ""
19 | echo "Params:"
20 | echo " [work_dir] The working directory to read .env file and execute scripts from."
21 | echo " [venv] The path where a python3 virtualenv has been created."
22 | echo " [run] Flag to run the script."
23 | echo " [dryrun] Flag to run the script in dry-run mode."
24 | echo ""
25 | exit
26 | fi
27 |
28 | source $WORK_DIR/.env
29 |
30 | # set -x
31 | set -e
32 | set -o pipefail
33 |
34 | function log() {
35 | echo "`date` - $1"
36 | }
37 |
38 | source $VENV/bin/activate
39 | #pip install -r requirements.txt
40 | python3 -m channels.main_notify_leaderboard $RUN_FLAG $DRYRUN_FLAG
41 | deactivate
42 |
--------------------------------------------------------------------------------
/pipeline/run_notify_channel_weekly_mods.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | while getopts "w:v:b:s:d" flag
4 | do
5 | case "${flag}" in
6 | w) WORK_DIR=${OPTARG};;
7 | v) VENV=${OPTARG};;
8 | b) BOTS_CSV=${OPTARG};;
9 | s) SINCE_DATETIME=${OPTARG};;
10 | d) DRYRUN_FLAG="--dry-run";;
11 | esac
12 | done
13 |
14 | if [ -z "$WORK_DIR" ] || [ -z "$VENV" ] || [ -z "$BOTS_CSV" ] || [ -z "$SINCE_DATETIME" ]; then
15 | echo "Usage: $0 -w [work_dir] -v [venv] -b [bots_csv] -s [since_datetime] -d"
16 | echo ""
17 | echo "Example: $0 -w . -v /home/ubuntu/venvs/fc-graph-env3/ -b channels/Bot_Fids.csv -s '2025-04-23 16:30:00+00:00'"
18 | echo "Example: $0 -w . -v /home/ubuntu/venvs/fc-graph-env3/ -b channels/Bot_Fids.csv -s '2025-04-23 16:30:00+00:00' -d"
19 | echo ""
20 | echo "Params:"
21 | echo " [work_dir] The working directory to read .env file and execute scripts from."
22 | echo " [venv] The path where a python3 virtualenv has been created."
23 | echo " [bots_csv] The path to the CSV file that has list of mod bots."
24 | echo " [since_datetime] The datetime to get notifications since."
25 | echo " [dryrun] Flag to run the script in dry-run mode."
26 | echo ""
27 | exit
28 | fi
29 |
30 | source $WORK_DIR/.env
31 |
32 | # set -x
33 | set -e
34 | set -o pipefail
35 |
36 | function log() {
37 | echo "`date` - $1"
38 | }
39 |
40 | source $VENV/bin/activate
41 | #pip install -r requirements.txt
42 | python3 -m channels.main_notify_weekly_mods -b "$BOTS_CSV" -s "$SINCE_DATETIME" $DRYRUN_FLAG
43 | deactivate
44 |
--------------------------------------------------------------------------------
/pipeline/run_update_channel_points.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | while getopts w:v:t:p:g: flag
4 | do
5 | case "${flag}" in
6 | w) WORK_DIR=${OPTARG};;
7 | v) VENV=${OPTARG};;
8 | t) TASK=${OPTARG};;
9 | p) POSTGRES=${OPTARG};;
10 | g) GAPFILL_DATE=${OPTARG};;
11 | esac
12 | done
13 |
14 | if [ -z "$WORK_DIR" ] || [ -z "$VENV" ] || [ -z "$TASK" ]; then
15 | echo "Usage: $0 -w [work_dir] -v [venv] -t [task]"
16 | echo "Usage: $0 -w [work_dir] -v [venv] -t [task] -p [postgres]"
17 | echo "Usage: $0 -w [work_dir] -v [venv] -t gapfill -p [postgres] -g [gapfill_date] "
18 | echo ""
19 | echo "Example: $0 -w . -v /home/ubuntu/venvs/fc-graph-env3/ -t genesis"
20 | echo "Example: $0 -w . -v /home/ubuntu/venvs/fc-graph-env3/ -t compute"
21 | echo "Example: $0 -w . -v /home/ubuntu/venvs/fc-graph-env3/ -t update"
22 | echo "Example: $0 -w . -v /home/ubuntu/venvs/fc-graph-env3/ -t update -p eigen8 -g 2025-04-01"
23 | echo ""
24 | echo "Params:"
25 | echo " [work_dir] The working directory to read .env file and execute scripts from."
26 | echo " [venv] The path where a python3 virtualenv has been created."
27 | echo " [task] The task to perform: daily or distrib."
28 | echo " [postgres] The name of the postgres database to connect to."
29 | echo " [gapfill_date] The date to use for gapfilling in YYYY-MM-DD format."
30 | echo ""
31 | exit
32 | fi
33 |
34 | if [ ! -z "$POSTGRES" ]; then
35 | PG_OPTION="--postgres $POSTGRES"
36 | fi
37 |
38 | if [ "$TASK" = "gapfill" ]; then
39 | if [ -z "$GAPFILL_DATE" ]; then
40 | echo "Please specify -g (gapfill_date) for the gapfill task."
41 | exit 1
42 | fi
43 | fi
44 |
45 | # validating GAPFILL_DATE in bash is a bit of a pain
46 | # ... let the python script validate it
47 | if [ ! -z "$GAPFILL_DATE" ]; then
48 | GAPFILL_OPTION="--gapfill-date $GAPFILL_DATE"
49 | fi
50 |
51 | source $WORK_DIR/.env
52 |
53 | # set -x
54 | set -e
55 | set -o pipefail
56 |
57 | function log() {
58 | echo "`date` - $1"
59 | }
60 |
61 | source $VENV/bin/activate
62 | #pip install -r requirements.txt
63 | python3 -m channels.main_points -t "$TASK" $PG_OPTION $GAPFILL_OPTION
64 | deactivate
65 |
--------------------------------------------------------------------------------
/pipeline/run_update_channel_tokens.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | while getopts w:v:t:s:r:p: flag
4 | do
5 | case "${flag}" in
6 | w) WORK_DIR=${OPTARG};;
7 | v) VENV=${OPTARG};;
8 | t) TASK=${OPTARG};;
9 | s) SCOPE=${OPTARG};;
10 | r) REASON=${OPTARG};;
11 | p) POSTGRES=${OPTARG};;
12 | esac
13 | done
14 |
15 | if [ -z "$WORK_DIR" ] || [ -z "$VENV" ] || [ -z "$TASK" ]; then
16 | echo "Usage: $0 -w [work_dir] -v [venv] -t [task]"
17 | echo "Usage: $0 -w [work_dir] -v [venv] -t [task] -s [scope] -r [reason] -p [postgres]"
18 | echo ""
19 | echo "Example: $0 -w . -v /home/ubuntu/venvs/fc-graph-env3/ -t prep -s weekly -r reason -p eigen8"
20 | echo " $0 -w . -v /home/ubuntu/venvs/fc-graph-env3/ -t distrib"
21 | echo " $0 -w . -v /home/ubuntu/venvs/fc-graph-env3/ -t verify"
22 | echo ""
23 | echo "Params:"
24 | echo " [work_dir] The working directory to read .env file and execute scripts from."
25 | echo " [venv] The path where a python3 virtualenv has been created."
26 | echo " [task] The task to perform: prep or distrib or verify."
27 | echo " [scope] The scope of channels to import: airdrop or daily calculation."
28 | echo " [reason] The reason for the distribution."
29 | echo " [postgres] The name of the postgres database to connect to."
30 | echo ""
31 | exit
32 | fi
33 |
34 | if [ "$TASK" = "prep" ]; then
35 | if [ -z "$SCOPE" ] || [ -z "$REASON" ]; then
36 | echo "Please specify -s (scope) and -r (reason) for the prep task."
37 | exit 1
38 | fi
39 | fi
40 |
41 | if [ ! -z "$POSTGRES" ]; then
42 | PG_OPTION="--postgres $POSTGRES"
43 | fi
44 |
45 | source $WORK_DIR/.env
46 |
47 | # set -x
48 | set -e
49 | set -o pipefail
50 |
51 | function log() {
52 | echo "`date` - $1"
53 | }
54 |
55 | source $VENV/bin/activate
56 | #pip install -r requirements.txt
57 | if [ "$TASK" = "prep" ]; then
58 | python3 -m channels.main_tokens -t prep -s "$SCOPE" -r "$REASON" $PG_OPTION
59 | deactivate
60 | elif [ "$TASK" = "distrib" ]; then
61 | python3 -m channels.main_tokens -t distrib $PG_OPTION
62 | deactivate
63 | elif [ "$TASK" = "verify" ]; then
64 | python3 -m channels.main_tokens -t verify $PG_OPTION
65 | deactivate
66 | else
67 | echo "Invalid task specified. Use 'prep', 'distrib' or 'verify'."
68 | exit 1
69 | fi
70 |
--------------------------------------------------------------------------------
/pipeline/samples/pretrust.csv:
--------------------------------------------------------------------------------
1 | i,v
2 | 2,0.5
3 | 3,0.5
4 |
--------------------------------------------------------------------------------
/pipeline/schema/globaltrust_config.sql:
--------------------------------------------------------------------------------
1 | --
2 | -- PostgreSQL database dump
3 | --
4 |
5 | -- Dumped from database version 16.2
6 | -- Dumped by pg_dump version 16.2
7 |
8 | SET statement_timeout = 0;
9 | SET lock_timeout = 0;
10 | SET idle_in_transaction_session_timeout = 0;
11 | SET client_encoding = 'UTF8';
12 | SET standard_conforming_strings = on;
13 | SELECT pg_catalog.set_config('search_path', '', false);
14 | SET check_function_bodies = false;
15 | SET xmloption = content;
16 | SET client_min_messages = warning;
17 | SET row_security = off;
18 |
19 | SET default_tablespace = '';
20 |
21 | SET default_table_access_method = heap;
22 |
23 | --
24 | -- Name: globaltrust_config; Type: TABLE; Schema: public; Owner: k3l_user
25 | --
26 |
27 | CREATE TABLE public.globaltrust_config (
28 | strategy_id integer NOT NULL,
29 | strategy_name character varying(255) NOT NULL,
30 | pretrust text,
31 | localtrust text,
32 | alpha real,
33 | date date DEFAULT CURRENT_TIMESTAMP NOT NULL
34 | );
35 |
36 |
37 | ALTER TABLE public.globaltrust_config OWNER TO k3l_user;
38 |
39 | --
40 | -- Data for Name: globaltrust_config; Type: TABLE DATA; Schema: public; Owner: k3l_user
41 | --
42 |
43 | COPY public.globaltrust_config (strategy_id, strategy_name, pretrust, localtrust, alpha, date) FROM stdin;
44 | 1 follows pretrustAllEqually existingConnections 0.5 2023-12-07
45 | 3 engagement pretrustAllEqually l1rep6rec3m12enhancedConnections 0.5 2023-12-07
46 | 5 activity pretrustAllEqually l1rep1rec1m1enhancedConnections 0.5 2023-12-07
47 | 7 OG circles pretrustSpecificUsernames existingConnections 0.5 2023-12-07
48 | 9 OG engagement pretrustSpecificUsernames l1rep6rec3m12enhancedConnections 0.5 2023-12-07
49 | 11 OG activity pretrustSpecificUsernames l1rep1rec1m1enhancedConnections 0.5 2023-12-07
50 | 1 follows pretrustTopTier existingConnections 0.5 2024-03-14
51 | 3 engagement pretrustTopTier l1rep6rec3m12enhancedConnections 0.5 2024-03-14
52 | 1 follows pretrustTopTier existingConnections 0.5 2024-09-27
53 | 3 engagement pretrustTopTier l1rep6rec3m12enhancedConnections 0.5 2024-09-27
54 | 9 v3engagement v2pretrustTopTier followsboostedl1rep3rec6m12 0.5 2024-09-27
55 | \.
56 |
57 |
58 | --
59 | -- Name: globaltrust_config globaltrust_config_pkey; Type: CONSTRAINT; Schema: public; Owner: k3l_user
60 | --
61 |
62 | ALTER TABLE ONLY public.globaltrust_config
63 | ADD CONSTRAINT globaltrust_config_pkey PRIMARY KEY (strategy_id, date);
64 |
65 |
66 | --
67 | -- Name: TABLE globaltrust_config; Type: ACL; Schema: public; Owner: k3l_user
68 | --
69 |
70 | GRANT SELECT,REFERENCES ON TABLE public.globaltrust_config TO k3l_readonly;
71 |
72 |
73 | --
74 | -- PostgreSQL database dump complete
75 | --
76 |
77 |
--------------------------------------------------------------------------------
/pipeline/scripts/archived/run_create_degen_db_functions.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -x
4 | set -e
5 |
6 | while getopts v:t: flag
7 | do
8 | case "${flag}" in
9 | v) VENV=${OPTARG};;
10 | t) TASK=${OPTARG};;
11 | esac
12 | done
13 |
14 | if [ -z "$VENV" ]; then
15 | echo "Usage: $0 -v [venv]"
16 | echo ""
17 | echo "Example: $0 -v /home/ubuntu/venvs/fc-graph-env3/"
18 | echo ""
19 | echo "Params:"
20 | echo " [venv] The path where a python3 virtualenv has been created."
21 | echo " [task] The task to perform: 'extract' or 'insert_scores'."
22 | echo ""
23 | exit
24 | fi
25 |
26 | # set -x
27 | set -e
28 | set -o pipefail
29 |
30 | source $VENV/bin/activate
31 | # pip install -r requirements.txt
32 |
33 | echo "Executing task: $TASK"
34 | if [ "$TASK" = "extract" ]; then
35 | python3 -m degen.create_degen_sql_functions
36 | elif [ "$TASK" = "insert_scores" ]; then
37 | python3 -m degen.calculate_rank
38 | else
39 | echo "Invalid task specified. Use 'extract' or 'insert_scores'."
40 | exit 1
41 | fi
42 | deactivate
43 |
--------------------------------------------------------------------------------
/pipeline/scripts/archived/run_sandbox_backup.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | source .env
4 |
5 | set -x
6 | set -e # Exit immediately if a command exits with a non-zero status
7 | set -o pipefail # Ensure pipeline failures are propagated
8 |
9 |
10 | # TODO: move this to cli args
11 | DATE_SUFFIX=$(date +"%Y%m%d" )
12 | BACKUP_DIR="/tmp/sandbox-backup-$DATE_SUFFIX"
13 | BACKUP_FILE="sandbox_pgdump"
14 | S3_BUCKET='k3l-farcaster-backups'
15 | S3_PREFIX='pg_dump/'
16 |
17 | #DB details
18 | DB_NAME=$SANDBOX_DB_NAME
19 | DB_USER=$SANDBOX_DB_USER
20 | DB_PASSWORD=$SANDBOX_DB_PASSWORD
21 | DB_HOST=$SANDBOX_DB_HOST
22 | DB_PORT=$SSH_LISTEN_PORT
23 |
24 | rm -rf "$BACKUP_DIR"
25 | mkdir -p "$BACKUP_DIR"
26 |
27 | # Perform the backup
28 | echo "Starting backup..."
29 | set +x # Disable command echoing
30 | export PGPASSWORD="$DB_PASSWORD"
31 | set -x # Re-enable command echoing
32 | pg_dump -h $DB_HOST -p $DB_PORT -U $DB_USER -d $DB_NAME \
33 | -j 1 \
34 | -Fd \
35 | -f "$BACKUP_DIR/$BACKUP_FILE"
36 | unset PGPASSWORD
37 |
38 | # Check if backup was successful
39 | if [ $? -eq 0 ]; then
40 | echo "Backup completed successfully"
41 |
42 | # Compress the backup
43 | tar czf "$BACKUP_DIR/$BACKUP_FILE.tgz" -C "$BACKUP_DIR" $BACKUP_FILE
44 | echo "Backup compressed"
45 |
46 | # Upload to S3
47 | echo "Uploading backup to S3..."
48 | aws s3 cp "$BACKUP_DIR/$BACKUP_FILE.tgz" "s3://$S3_BUCKET/$S3_PREFIX$BACKUP_FILE.tgz"
49 |
50 | if [ $? -eq 0 ]; then
51 | echo "Backup successfully uploaded to S3"
52 | rm -rf "$BACKUP_DIR"
53 | else
54 | echo "Failed to upload backup to S3"
55 | exit 1
56 | fi
57 | else
58 | echo "Backup failed"
59 | exit 1
60 | fi
61 |
62 | exit 0
63 |
--------------------------------------------------------------------------------
/pipeline/scripts/archived/run_urlextract_pipeline.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | while getopts w: flag
4 | do
5 | case "${flag}" in
6 | w) WORK_DIR=${OPTARG};;
7 | esac
8 | done
9 |
10 | if [ -z "$WORK_DIR" ]; then
11 | echo "Usage: $0 -w [work_dir]"
12 | echo ""
13 | echo "Example: $0 -w ."
14 | echo ""
15 | echo "Params:"
16 | echo " [work_dir] The working directory to read .env file and execute scripts from."
17 | echo ""
18 | exit
19 | fi
20 |
21 | source $WORK_DIR/.env
22 |
23 | DB_HOST=${DB_HOST:-127.0.0.1}
24 | DB_PORT=${DB_PORT:-5432}
25 | DB_USER=${DB_USER:-replicator}
26 | DB_NAME=${DB_NAME:-replicator}
27 | DB_PASSWORD=${DB_PASSWORD:-password} # psql requires PGPASSWORD to be set
28 |
29 | # set -x
30 | set -e
31 | set -o pipefail
32 |
33 | if hash psql 2>/dev/null; then
34 | echo "OK, you have psql in the path. We’ll use that."
35 | PSQL=psql
36 | else
37 |   echo "You don't have psql in the path. Let's try /usr/bin"
38 | hash /usr/bin/psql
39 | PSQL=/usr/bin/psql
40 | fi
41 |
42 | function log() {
43 | echo "`date` - $1"
44 | }
45 |
46 | log "Inserting into k3l_url_labels"
47 | PGPASSWORD=$DB_PASSWORD \
48 | $PSQL -h $DB_HOST -p $DB_PORT -U $DB_USER -d $DB_NAME \
49 | -f $WORK_DIR/frames/incremental_load_labels.sql
50 |
51 | wait $!
52 |
53 | log "Inserting into k3l_cast_embed_url_mapping"
54 | PGPASSWORD=$DB_PASSWORD \
55 | $PSQL -h $DB_HOST -p $DB_PORT -U $DB_USER -d $DB_NAME \
56 | -f $WORK_DIR/frames/incremental_load_cast_mapping.sql
57 |
58 | wait $!
59 |
60 | this_name=`basename "$0"`
61 | log "$this_name done!"
--------------------------------------------------------------------------------
/pipeline/scripts/one_off/diff_db_table.py:
--------------------------------------------------------------------------------
1 | if __name__ == "__main__":
2 | print("Not implemented")
3 | pass
--------------------------------------------------------------------------------
/pipeline/scripts/one_off/run_cast_pipeline_gapfills.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | date_format='%Y-%m-%d'
4 |
5 | # Function to validate date format
6 | function validate_date() {
7 | date_to_check=$1
8 |
9 | # Check if the date matches the format YYYY-mm-dd
10 | if [[ $(uname) == "Darwin" ]]; then
11 | if ! date -j -f "$date_format" "$date_to_check" >/dev/null 2>&1; then
12 | echo "Invalid date format. Use YYYY-mm-dd."
13 | exit 1
14 | fi
15 | else
16 | if ! date -d "$date_to_check" +"$date_format" >/dev/null 2>&1; then
17 | echo "Invalid date format. Use YYYY-mm-dd."
18 | exit 1
19 | fi
20 | fi
21 |
22 | # Check if the date is in the past
23 | today=$(date +"$date_format")
24 | if [ "$date_to_check" \> "$today" ] || [ "$date_to_check" == "$today" ]; then
25 | echo "The date must be in the past and not include today."
26 | exit 1
27 | fi
28 | }
29 |
30 | while getopts v:s:p:e:l: flag
31 | do
32 | case "${flag}" in
33 | v) VENV=${OPTARG};;
34 | s) START_DATE=${OPTARG};;
35 | e) END_DATE=${OPTARG};;
36 | p) POSTGRES=${OPTARG};;
37 | l) SLEEP_TIME=${OPTARG};;
38 | esac
39 | done
40 |
41 | if [ -z "$VENV" ] || [ -z "$START_DATE" ] || [ -z "$END_DATE" ]; then
42 | echo "Usage: $0 -v [venv] -s [start_date] -e [end_date]"
43 | echo "Usage: $0 -v [venv] -s [start_date] -e [end_date] -p [postgres] -l [sleep_time]"
44 | echo ""
45 | echo "Example: $0 -v /home/ubuntu/venvs/fc-graph-env3/ -s 2025-02-01 -e 2025-02-05"
46 | echo "Example: $0 -v /home/ubuntu/venvs/fc-graph-env3/ -s 2025-02-01 -e 2025-02-05 -p eigen8"
47 | echo ""
48 | echo "Params:"
49 | echo " [venv] The path where a python3 virtualenv has been created."
50 | echo " [start_date] The date to start the gapfilling process."
51 | echo " [end_date] The date to end the gapfilling process."
52 | echo " [postgres] 'eigen2' or 'eigen8'"
53 | echo " [sleep_time] The amount of time to sleep between gapfill runs."
54 | echo ""
55 | exit
56 | fi
57 |
58 | if [ ! -z "$POSTGRES" ]; then
59 | PG_OPTION="--postgres $POSTGRES"
60 | fi
61 |
62 | validate_date $START_DATE
63 | validate_date $END_DATE
64 |
65 | SLEEP_TIME=${SLEEP_TIME:-30s}
66 |
67 |
68 | # set -x
69 | set -e
70 | set -o pipefail
71 |
72 | function log() {
73 | echo "`date` - $1"
74 | }
75 |
76 | source $VENV/bin/activate
77 | # pip install -r requirements.txt
78 | while [[ $START_DATE < $END_DATE ]]; do
79 | DATE_OPTION=(--target-date "$START_DATE 00:00:00")
80 | FILL_TYPE="gapfill"
81 | DAEMON_FLAG=""
82 | log "Running gapfill for $START_DATE"
83 | python3 -m casts.main $PG_OPTION $DAEMON_FLAG -f $FILL_TYPE "${DATE_OPTION[@]}"
84 | log "Sleeping for $SLEEP_TIME"
85 | sleep $SLEEP_TIME
86 |   START_DATE=$(date -I -d "$START_DATE + 1 day")  # note: GNU date syntax; the Darwin handling above covers validation only
87 | done
88 | deactivate
89 |
90 | log "Done"
91 |
--------------------------------------------------------------------------------
/pipeline/sshtunnel.Dockerfile:
--------------------------------------------------------------------------------
1 | FROM alpine:3.8
2 |
3 | RUN apk add --no-cache autossh libressl
4 |
5 | RUN mkdir -p ~/.ssh
6 |
7 | ENTRYPOINT ["/usr/bin/autossh", \
8 | "-M", "0", "-T", "-N", "-g", "-v", \
9 | "-oStrictHostKeyChecking=no", \
10 | "-oServerAliveInterval=180", \
11 | "-oUserKnownHostsFile=/dev/null", \
12 | "-oGlobalKnownHostsFile=/dev/null", \
13 | "-i/root/.ssh/id_rsa"]
--------------------------------------------------------------------------------
/pipeline/timer.py:
--------------------------------------------------------------------------------
1 | # Adapted from https://realpython.com/python-timer/#the-python-timer-code
2 | import time
3 | from contextlib import ContextDecorator
4 | from dataclasses import dataclass, field
5 | from typing import Any, Callable, ClassVar, Dict, Optional
6 |
7 | class TimerError(Exception):
8 | """A custom exception used to report errors in use of Timer class"""
9 |
10 | @dataclass
11 | class Timer(ContextDecorator):
12 | """Time your code using a class, context manager, or decorator
13 | Class:
14 | ======
15 | t = Timer(name="class")
16 | t.start()
17 | # Do something
18 | t.stop()
19 | Context Manager:
20 | ================
21 | with Timer(name="context manager"):
22 | # Do something
23 | Decorator:
24 | ==========
25 | @Timer(name="decorator")
26 | def stuff():
27 | # Do something
28 | """
29 |
30 | timers: ClassVar[Dict[str, float]] = {}
31 | name: Optional[str] = None
32 | text: str = "Elapsed time: {n} took {t:0.4f} seconds"
33 | logger: Optional[Callable[[str], None]] = print
34 | _start_time: Optional[float] = field(default=None, init=False, repr=False)
35 |
36 | def __post_init__(self) -> None:
37 | """Initialization: add timer to dict of timers"""
38 | if self.name:
39 | self.timers.setdefault(self.name, 0)
40 |
41 | def start(self) -> None:
42 | """Start a new timer"""
43 | if self._start_time is not None:
44 |             raise TimerError("Timer is running. Use .stop() to stop it")
45 |         if self.logger: self.logger("Start a new timer: {n}".format(n=self.name))
46 | self._start_time = time.perf_counter()
47 |
48 | def stop(self) -> float:
49 | """Stop the timer, and report the elapsed time"""
50 | if self._start_time is None:
51 |             raise TimerError("Timer is not running. Use .start() to start it")
52 |
53 | # Calculate elapsed time
54 | elapsed_time = time.perf_counter() - self._start_time
55 | self._start_time = None
56 |
57 | # Report elapsed time
58 | if self.logger:
59 | self.logger(self.text.format(n=self.name, t=elapsed_time))
60 | if self.name:
61 | self.timers[self.name] += elapsed_time
62 |
63 | return elapsed_time
64 |
65 | def __enter__(self) -> "Timer":
66 | """Start a new timer as a context manager"""
67 | self.start()
68 | return self
69 |
70 | def __exit__(self, *exc_info: Any) -> None:
71 | """Stop the context manager timer"""
72 | self.stop()
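73 | 
74 | 
75 | # Illustrative usage only (not part of the original module): a minimal sketch of the
76 | # three styles described in the Timer docstring above.
77 | if __name__ == "__main__":
78 |     with Timer(name="context-demo"):
79 |         time.sleep(0.05)
80 | 
81 |     @Timer(name="decorator-demo")
82 |     def do_work() -> None:
83 |         time.sleep(0.05)
84 | 
85 |     do_work()
86 |     print(Timer.timers)  # accumulated elapsed times, keyed by timer name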
--------------------------------------------------------------------------------
/pipeline/tmp/.placeholder:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/pipeline/tmp/.placeholder
--------------------------------------------------------------------------------
/scripts/.placeholder:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/scripts/.placeholder
--------------------------------------------------------------------------------
/scripts/certs/graphcast_jobs/.env.sample:
--------------------------------------------------------------------------------
1 | NGINX_CONFIG="/etc/nginx/sites-enabled/graph.cast.k3l.io"
2 | WORK_DIR="/home/ubuntu/graphcast_jobs"
3 | REMOTE_USER="ubuntu"
4 | REMOTE_DIR="/home/ubuntu/graphcast_jobs/"
5 | SSH_PRIV_KEY="/home/ubuntu/.ssh/id_graphcast_jobs"
6 |
7 |
--------------------------------------------------------------------------------
/scripts/certs/graphcast_jobs/README.md:
--------------------------------------------------------------------------------
1 | We use letsencrypt to issue SSL certs for our domains.
2 |
3 | # Step 1. graph.castN.k3l.io
4 |
5 | For example, graph.cast9.k3l.io. This sub-domain is not load-balanced but is very useful when we want to simulate a blue-green deployment. Setting up this sub-domain also makes the next step simpler.
6 |
7 | A typical crontab to both **install** as well as **renew** certs looks like this:
8 | ```
9 | 1 0 */7 * * sudo certbot run --nginx -d graph.cast9.k3l.io -m ops@karma3labs.com --agree-tos -n
10 | ```
11 | This crontab assumes that `/etc/nginx/sites-available/` is already configured for the sub-domain name.
12 |
13 | This repo has a sample nginx file that you can use. **REMEMBER** to replace `N` with your preferred number.
14 | Also, **REMEMBER** to soft link the config file: `sudo ln -s /etc/nginx/sites-available/graph.castN.k3l.io /etc/nginx/sites-enabled/`
15 | **NOTE**: the sample file does not have SSL config because certbot adds the appropriate config the first time it is run: `sudo certbot run --nginx -d graph.castN.k3l.io -m ops@karma3labs.com --agree-tos -n`
16 |
17 | # Step 2. graph.cast.k3l.io
18 | The sub-domain `graph.cast.k3l.io` is load-balanced across multiple machines. Certs cannot be renewed independently on each machine without invalidating the others, so we renew on one machine and push the resulting cert to all the other machines.
19 |
20 | The `push_certs.sh` script pushes the renewed cert from the "primary" host to the other machines, while `install_certs.sh` moves the copied pem files into place on each of those machines.
21 |
22 | #### Pre-req
23 | `/etc/nginx/sites-available/` should have a config for `graph.cast.k3l.io`
24 |
25 | This repo has a sample nginx file that you can use. **REMEMBER** to replace `CHANGME_OPENSSL_RAND_KEY` with a strong api key. Also, **REMEMBER** to soft link the config file `sudo ln -s /etc/nginx/sites-available/graph.cast.k3l.io /etc/nginx/sites-enabled/`
26 |
27 | #### Cronjobs
28 | A typical crontab on the **"primary"** host looks like this:
29 | ```
30 | 15 0 */7 * * sudo certbot run --nginx -d graph.cast.k3l.io -m ops@karma3labs.com --agree-tos -n >> /var/log/farcaster-graph/graphcast_jobs.log 2>&1; sudo nginx -s reload >> /var/log/farcaster-graph/graphcast_jobs.log 2>&1; date >> /var/log/farcaster-graph/graphcast_jobs.log ; cd /home/ubuntu/graphcast_jobs; ./push_certs.sh -h 162.55.109.106 >> /var/log/farcaster-graph/graphcast_jobs.log 2>&1;
31 | ```
32 | 1. renew cert `sudo certbot run --nginx -d graph.cast.k3l.io -m ops@karma3labs.com --agree-tos -n`
33 | 2. reload nginx locally to make sure cert is fine `sudo nginx -s reload`
34 | 3. push renewed cert to 162.55.109.106 `./push_certs.sh -h 162.55.109.106`
35 |
36 | And, the crontab on the **"secondary"** host looks like this:
37 | ```
38 | 30 0 */7 * * date >> /var/log/farcaster-graph/graphcast_jobs.log ; cd /home/ubuntu/graphcast_jobs; ./install_certs.sh >> /var/log/farcaster-graph/graphcast_jobs.log 2>&1
39 | ```
40 | 1. install cert assuming that graph.cast.k3l.io nginx config already exists and the "primary" server has scp'd over the pem files.
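41 | 
42 | #### Verifying a renewal (optional)
43 | As a quick sanity check after a renewal or a push, you can list the certs certbot manages and inspect what the live endpoint is actually serving. These are generic certbot/openssl commands, not part of the scripts in this repo:
44 | ```
45 | sudo certbot certificates
46 | echo | openssl s_client -connect graph.cast.k3l.io:443 -servername graph.cast.k3l.io 2>/dev/null | openssl x509 -noout -dates
47 | ```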
--------------------------------------------------------------------------------
/scripts/certs/graphcast_jobs/graph.castN.k3l.io:
--------------------------------------------------------------------------------
1 | # Allow listed IP addresses with no rate limits
2 | geo $limit {
3 | default 1;
4 | 10.0.0.0/8 0;
5 | 127.0.0.1/32 0;
6 | 192.168.0.0/24 0;
7 | }
8 |
9 | map $limit $limit_key {
10 | 0 "";
11 | 1 $binary_remote_addr;
12 | }
13 |
14 | # Reserve 10 MB of storage for binary IP addresses (roughly 160K 64-byte states)
15 | # to limit at 5 requests/second
16 | limit_req_zone $limit_key zone=graph_castN_zone:10m rate=5r/s;
17 |
18 | server {
19 | server_name graph.castN.k3l.io;
20 |
21 | location ~* \.(env|git|bak|config|log|sh).* {
22 | deny all;
23 | return 404;
24 | }
25 |
26 |
27 | location ~ ^/(_pause|_resume) {
28 | return 404;
29 | }
30 |
31 | location / {
32 | # apply rate limit
33 | limit_req zone=graph_castN_zone burst=10;
34 | proxy_pass http://localhost:8000;
35 | proxy_http_version 1.1;
36 | proxy_set_header Upgrade $http_upgrade;
37 | proxy_set_header Connection 'upgrade';
38 | proxy_set_header Host $host;
39 | proxy_cache_bypass $http_upgrade;
40 | }
41 |
42 | }
43 |
44 | server {
45 | server_name graph.castN.k3l.io;
46 |
47 | location ~* \.(woff|jpg|jpeg|png|gif|ico|css|js)$ {
48 | access_log off;
49 | }
50 |
51 | listen 80;
52 | }
53 |
--------------------------------------------------------------------------------
/scripts/certs/graphcast_jobs/install_certs.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -e
4 |
5 | # Function to log messages with a timestamp
6 | log_message() {
7 | echo "$(date '+%Y-%m-%d %H:%M:%S') - $1"
8 | }
9 |
10 | # Source the environment variables from the .env file
11 | if [ -f .env ]; then
12 | source .env
13 | else
14 | log_message "Error: .env file not found."
15 | exit 1
16 | fi
17 |
18 | # Check if NGINX_CONFIG and WORK_DIR are set
19 | if [ -z "$NGINX_CONFIG" ] || [ -z "$WORK_DIR" ]; then
20 |     log_message "Error: NGINX_CONFIG and WORK_DIR environment variables must be set."
21 | exit 1
22 | fi
23 |
24 | log_message "Starting install_certs.sh script."
25 |
26 | # Extract the certificate file paths from the Nginx config file
27 | log_message "Extracting certificate file paths from the Nginx config file."
28 | CERT_FILES=$(grep -E 'ssl_certificate|ssl_certificate_key' $NGINX_CONFIG | awk '{print $2}' | tr -d ';')
29 |
30 | # Flag to indicate if any files were moved
31 | FILES_MOVED=false
32 |
33 | # Check and move the files if they exist
34 | for FILE in $CERT_FILES; do
35 | FILE_NAME=$(basename $FILE)
36 | DIR_NAME=$(dirname $FILE)
37 | if [ -f ${WORK_DIR}/${FILE_NAME} ]; then
38 | log_message "Moving ${WORK_DIR}/${FILE_NAME} to $FILE."
39 | sudo mkdir -p $DIR_NAME
40 | sudo mv ${WORK_DIR}/${FILE_NAME} $FILE
41 | FILES_MOVED=true
42 | else
43 | log_message "File ${WORK_DIR}/${FILE_NAME} not found."
44 | fi
45 | done
46 |
47 | # Reload Nginx if any files were moved
48 | if [ "$FILES_MOVED" = true ]; then
49 | log_message "Files moved. Reloading Nginx."
50 | sudo nginx -s reload
51 | else
52 | log_message "No files moved. Nginx reload not required."
53 | fi
54 |
55 | log_message "Script completed."
56 |
--------------------------------------------------------------------------------
/scripts/certs/graphcast_jobs/push_certs.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | while getopts h: flag
4 | do
5 | case "${flag}" in
6 | h) REMOTE_HOST=${OPTARG};;
7 | esac
8 | done
9 |
10 | if [ -z "$REMOTE_HOST" ]; then
11 | echo "Usage: $0 -h [remote_host]"
12 | echo ""
13 | echo "Example: $0 -h 37.27.108.188"
14 | echo ""
15 | echo "Params:"
16 | echo " [remote_host] host to which the pem files have to be copied over to"
17 |   echo ""
18 | exit
19 | fi
20 |
21 |
22 | # Function to log messages with a timestamp
23 | log_message() {
24 | echo "$(date '+%Y-%m-%d %H:%M:%S') - $1"
25 | }
26 |
27 | # Source the environment variables from the .env file
28 | if [ -f .env ]; then
29 | log_message "Loading .env file."
30 | source .env
31 | else
32 | log_message "Error: .env file not found."
33 | exit 1
34 | fi
35 |
36 | # Check if NGINX_CONFIG, REMOTE_USER, REMOTE_DIR, and SSH_PRIV_KEY are set (REMOTE_HOST comes from the -h flag)
37 | if [ -z "$NGINX_CONFIG" ] || [ -z "$REMOTE_USER" ] || [ -z "$REMOTE_DIR" ] || [ -z "$SSH_PRIV_KEY" ]; then
38 |     log_message "Error: NGINX_CONFIG, REMOTE_USER, REMOTE_DIR and SSH_PRIV_KEY environment variables must be set."
39 | exit 1
40 | fi
41 |
42 | log_message "Starting push_certs.sh script."
43 |
44 | # Extract the certificate file paths from the Nginx config file
45 | log_message "Extracting certificate file paths from the Nginx config file."
46 | CERT_FILES=$(grep -E 'ssl_certificate|ssl_certificate_key' $NGINX_CONFIG | awk '{print $2}' | tr -d ';')
47 |
48 | # SCP the certificate files to the remote server
49 | for FILE in $CERT_FILES; do
50 | log_message "Transferring $FILE to ${REMOTE_USER}@${REMOTE_HOST}:${REMOTE_DIR}."
51 | sudo scp -p -i $SSH_PRIV_KEY $FILE ${REMOTE_USER}@${REMOTE_HOST}:${REMOTE_DIR}
52 | done
53 |
54 | log_message "Script completed."
55 |
--------------------------------------------------------------------------------
/serve/.dockerignore:
--------------------------------------------------------------------------------
1 | .venv
2 | .env.sample
3 | scratchpad.md
--------------------------------------------------------------------------------
/serve/.env.sample:
--------------------------------------------------------------------------------
1 | DB_PASSWORD=password
2 | DB_HOST=host
3 | DB_NAME=postgres
4 | DB_USERNAME=postgres
5 | DB_PORT=5432
6 |
7 | GO_EIGENTRUST_URL=http://localhost:8080
8 |
9 | FOLLOW_GRAPH_PATHPREFIX=./samples/fc_following_fid
10 | ENGAGEMENT_GRAPH_PATHPREFIX=./samples/fc_engagement_fid
11 | NINETYDAYS_GRAPH_PATHPREFIX=./samples/fc_90dv3_fid
12 |
13 | # SWAGGER_BASE_URL='CHANGE THIS AND UNCOMMENT'
14 | # CURA_API_KEY='CHANGE THIS AND UNCOMMENT'
15 |
16 | USE_PANDAS_PERF='True or False ?'
17 | # optional overrides
18 | # LOG_LEVEL=INFO
19 | # LOG_LEVEL_CORE='DEBUG'
20 | # LOGURU_FORMAT='{time:YYYY-MM-DD HH:mm:ss} | {module}:{file}:{function}:{line} | {level} | {message}'
21 |
22 | # POSTGRES_POOL_SIZE=5
23 | # POSTGRES_ECHO=False
24 | # POSTGRES_TIMEOUT_SECS=60
25 |
26 | # EIGENTRUST_ALPHA=0.5
27 | # EIGENTRUST_EPSILON=1.0
28 | # EIGENTRUST_MAX_ITER=50
29 | # EIGENTRUST_FLAT_TAIL=2
30 | # GO_EIGENTRUST_TIMEOUT_MS=3000
31 |
32 | # CURA_API_ENDPOINT=https://cura.network/api
33 |
--------------------------------------------------------------------------------
/serve/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 | # Editor-based HTTP Client requests
5 | /httpRequests/
6 | # Datasource local storage ignored files
7 | /dataSources/
8 | /dataSources.local.xml
9 |
--------------------------------------------------------------------------------
/serve/.idea/.name:
--------------------------------------------------------------------------------
1 | farcaster-graph-serve
--------------------------------------------------------------------------------
/serve/.idea/codeStyles/codeStyleConfig.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
--------------------------------------------------------------------------------
/serve/.idea/dataSources.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | postgresql
6 | true
7 | org.postgresql.Driver
8 | jdbc:postgresql://localhost:9541/postgres
9 | $ProjectFileDir$
10 |
11 |
12 |
--------------------------------------------------------------------------------
/serve/.idea/data_source_mapping.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/serve/.idea/farcaster-graph-serve.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
--------------------------------------------------------------------------------
/serve/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/serve/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 |
77 |
--------------------------------------------------------------------------------
/serve/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/serve/.idea/sqldialects.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/serve/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/serve/.idea/watcherTasks.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
--------------------------------------------------------------------------------
/serve/Dockerfile:
--------------------------------------------------------------------------------
1 | # FROM python:3.12-alpine
2 | # not taking the alpine route because packages like psutil don't install without gcc
3 | FROM python:3.12-slim
4 |
5 | RUN pip install --upgrade pip
6 |
7 | RUN pip install poetry
8 |
9 | # single app container
10 | # no need to create virtual envs
11 | # install dependencies into the systems python environment
12 | ENV POETRY_VERSION=1.7.1 \
13 | POETRY_NO_INTERACTION=1 \
14 | POETRY_VIRTUALENVS_CREATE=false
15 |
16 | WORKDIR /code
17 |
18 | COPY pyproject.toml poetry.lock ./
19 | COPY README.md ./
20 |
21 | # we don't want to rebuild all the layers after every app code change
22 | # ignore app code for now
23 | # uncomment the next line if we start using dev/test specific dependencies
24 | # RUN poetry install --without dev,test --no-root
25 | RUN poetry install --no-root
26 |
27 | COPY ./app /code/app
28 | COPY ./static /code/static
29 | COPY .env.docker ./.env
30 |
31 | # install app code, this is the last image layer and has to be rebuilt
32 | # uncomment the next line if we start using dev/test specific dependencies
33 | # RUN poetry install --without dev,test
34 | RUN poetry install --no-root
35 |
36 | EXPOSE 8000
37 |
38 | CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
39 |
--------------------------------------------------------------------------------
/serve/app/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/serve/app/__init__.py
--------------------------------------------------------------------------------
/serve/app/dependencies/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/serve/app/dependencies/__init__.py
--------------------------------------------------------------------------------
/serve/app/dependencies/cache_db_utils.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | from asyncpg.pool import Pool
4 | from loguru import logger
5 |
6 |
7 | async def set_homefeed_for_fid(
8 | fid: int, cids: list[str], offset: int, cache_pool: Pool
9 | ):
10 |
11 | session_data = {"api": "homefeed", "cids": cids, "offset": offset}
12 | session_value = json.dumps(session_data)
13 | key = f"session:{fid}"
14 |
15 | # TODO update db using cache_pool
16 | pass
17 |
18 |
19 | async def get_homefeed_for_fid(fid: int, cache_pool: Pool) -> dict:
20 |
21 | key = f"session:{fid}"
22 |
23 | # TODO get cached data from db using cache_pool
24 |
25 | return {"cids": [], "offset": 0}
26 |
--------------------------------------------------------------------------------
/serve/app/dependencies/db_pool.py:
--------------------------------------------------------------------------------
1 | from fastapi import Request
2 |
3 |
4 | # dependency to make it explicit that routers are accessing hidden state
5 | def get_db(request: Request):
6 | return request.state.db_pool
7 |
8 |
9 | def get_cache_db(request: Request):
10 | return request.state.cache_db_pool
11 |
--------------------------------------------------------------------------------
/serve/app/dependencies/logging.py:
--------------------------------------------------------------------------------
1 | import inspect
2 | import logging
3 |
4 | from fastapi import Request
5 | from loguru import logger
6 |
7 |
8 | async def get_logger(request: Request):
9 | logger.debug(f"{request.method} {request.url}")
10 | logger.debug("Params:")
11 | for name, value in request.path_params.items():
12 | logger.debug(f"\t{name}: {value}")
13 | logger.debug("Headers:")
14 | for name, value in request.headers.items():
15 | logger.debug(f"\t{name}: {value}")
16 |
17 |
18 | class InterceptHandler(logging.Handler):
19 | """
20 | This intercept allows loguru to work with Python's standard logging module.
21 | https://loguru.readthedocs.io/en/stable/overview.html#entirely-compatible-with-standard-logging
22 | """
23 |
24 | def emit(self, record: logging.LogRecord) -> None:
25 | # Get corresponding Loguru level if it exists.
26 | level: str | int
27 | try:
28 | level = logger.level(record.levelname).name
29 | except ValueError:
30 | level = record.levelno
31 |
32 | # Find caller from where originated the logged message.
33 | frame, depth = inspect.currentframe(), 0
34 | while frame and (depth == 0 or frame.f_code.co_filename == logging.__file__):
35 | frame = frame.f_back
36 | depth += 1
37 |
38 | logger.opt(depth=depth, exception=record.exc_info).log(
39 | level, record.getMessage()
40 | )
41 |
--------------------------------------------------------------------------------
/serve/app/dependencies/memoize_utils.py:
--------------------------------------------------------------------------------
1 | from typing import Any, Dict, Tuple
2 |
3 | from memoize.key import KeyExtractor
4 |
5 |
6 | class EncodedMethodNameAndArgsExcludedKeyExtractor(KeyExtractor):
7 | """Encodes method name, args & kwargs to string and uses that as cache entry key.
8 | This KeyExtractor is class-centric and creates same keys for all objects of the same type.
9 | You can exclude args and kwargs by setting 'skip_args' and 'skip_kwargs' flags.
10 |
11 | Note: If wrapped function is a method (has 'self' as first positional arg) you may want to exclude 'self' from key
12 | by setting 'skip_first_arg_as_self' flag.
13 |     For static methods or ordinary functions, the flag should be set to 'False'.
14 |
15 | Warning: uses method name only, so be cautious and do not wrap methods of different classes with the same names
16 | while using same store and 'skip_first_arg_as_self' set to False."""
17 |
18 | def __init__(
19 | self,
20 | skip_first_arg_as_self=False,
21 | skip_args: list[int] = [],
22 | skip_kwargs: list[str] = [],
23 | ) -> None:
24 | self._skip_first_arg_as_self = skip_first_arg_as_self
25 | self._skip_args = skip_args
26 | self._skip_kwargs = skip_kwargs
27 |
28 | def format_key(
29 | self, method_reference, call_args: Tuple[Any, ...], call_kwargs: Dict[str, Any]
30 | ) -> str:
31 | if self._skip_args:
32 | call_args = [
33 | arg for i, arg in enumerate(call_args) if i not in self._skip_args
34 | ]
35 | if self._skip_kwargs:
36 | call_kwargs = {
37 | k: v for k, v in call_kwargs.items() if k not in self._skip_kwargs
38 | }
39 | if self._skip_first_arg_as_self:
40 |             call_args = call_args[1:]  # avoid tuple.pop(); call_args may still be a tuple here
41 |
42 | return str(
43 | (
44 | method_reference.__name__,
45 | call_args,
46 | call_kwargs,
47 | )
48 | )
49 |
50 | def __str__(self) -> str:
51 | return self.__repr__()
52 |
53 | def __repr__(self) -> str:
54 | return (
55 | f"{self.__class__}"
56 | f"[skip_first_arg_as_self={self._skip_first_arg_as_self}]"
57 | f"[skip_args={self._skip_args}]"
58 | )
59 |
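60 | 
61 | # Illustrative example only (not part of the original module); 'some_method', 'obj' and
62 | # 'pool' are hypothetical names. With 'self' and a db-pool kwarg excluded, the cache key
63 | # is built from the method name plus the remaining args/kwargs, roughly:
64 | #   extractor = EncodedMethodNameAndArgsExcludedKeyExtractor(
65 | #       skip_first_arg_as_self=True, skip_kwargs=["pool"]
66 | #   )
67 | #   extractor.format_key(obj.some_method, (obj, 42), {"pool": pool, "limit": 10})
68 | #   -> "('some_method', (42,), {'limit': 10})"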
--------------------------------------------------------------------------------
/serve/app/models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/serve/app/models/__init__.py
--------------------------------------------------------------------------------
/serve/app/models/channel_model.py:
--------------------------------------------------------------------------------
1 | from enum import Enum, StrEnum
2 |
3 | from pydantic import BaseModel
4 |
5 |
6 | class ChannelRankingsTimeframe(str, Enum):
7 | LIFETIME = 'lifetime'
8 | SIXTY_DAYS = '60d'
9 | SEVEN_DAYS = '7d'
10 | ONE_DAY = '1d'
11 |
12 |
13 | CHANNEL_RANKING_STRATEGY_NAMES = {
14 | ChannelRankingsTimeframe.LIFETIME: 'channel_engagement',
15 | ChannelRankingsTimeframe.SIXTY_DAYS: '60d_engagement',
16 | ChannelRankingsTimeframe.SEVEN_DAYS: '7d_engagement',
17 | ChannelRankingsTimeframe.ONE_DAY: '1d_engagement',
18 | }
19 |
20 |
21 | class OpenrankCategory(StrEnum):
22 | TEST = 'test'
23 | PROD = 'prod'
24 |
25 |
26 | # Deprecated
27 | class ChannelPointsOrderBy(StrEnum):
28 | TOTAL_POINTS = 'total_points'
29 | DAILY_POINTS = 'daily_points'
30 |
31 |
32 | class ChannelEarningsOrderBy(StrEnum):
33 | TOTAL = 'total'
34 | WEEKLY = 'weekly'
35 | DAILY = 'daily'
36 | LATEST = 'latest'
37 |
38 |
39 | class ChannelEarningsScope(StrEnum):
40 | AIRDROP = 'airdrop'
41 | DAILY = 'daily'
42 |
43 |
44 | class ChannelEarningsType(StrEnum):
45 | POINTS = 'points'
46 | TOKENS = 'tokens'
47 |
48 |
49 | class ChannelFidType(StrEnum):
50 | MEMBER = 'member'
51 | FOLLOWER = 'follower'
52 |
--------------------------------------------------------------------------------
/serve/app/models/graph_model.py:
--------------------------------------------------------------------------------
1 | import io
2 | from enum import Enum
3 | from typing import NamedTuple
4 |
5 | import igraph
6 | import pandas
7 |
8 |
9 | class GraphType(Enum):
10 | following = 1
11 | # engagement = 3
12 | # v3engagement = 9
13 | ninetydays = 5
14 |
15 |
16 | class GraphTimeframe(str, Enum):
17 | # lifetime = "lifetime"
18 | ninetydays = "90d"
19 |
20 |
21 | class Graph(NamedTuple):
22 | success_file: str
23 | df: pandas.DataFrame
24 | graph: igraph.Graph
25 | type: GraphType
26 | mtime: float
27 |
28 | def __str__(self):
29 | df_info = io.StringIO()
30 | self.df.info(buf=df_info)
31 | return f"""
32 | type: {self.type}
33 | dataframe: {df_info.getvalue()}
34 | igraph: {self.graph.summary()}
35 | mtime: {self.mtime}
36 | """
37 |
--------------------------------------------------------------------------------
/serve/app/models/score_model.py:
--------------------------------------------------------------------------------
1 | import re
2 | from enum import StrEnum
3 | from typing import NamedTuple, Self
4 |
5 |
6 | class ScoreAgg(StrEnum):
7 | RMS = 'rms'
8 | SUMSQUARE = 'sumsquare'
9 | SUM = 'sum'
10 | SUMCUBEROOT = 'sumcuberoot'
11 |
12 |
13 | class Voting(StrEnum):
14 | SINGLE = 'single'
15 | MULTIPLE = 'multiple'
16 | # TODO
17 | # QUADRATIC = 'quadratic'
18 |
19 |
20 | class QueryType(StrEnum):
21 | SUPERLITE = 'superlite'
22 | LITE = 'lite'
23 | HEAVY = 'heavy'
24 |
25 |
26 | class EngagementType(StrEnum):
27 | V1 = '1.0'
28 | V3 = '2.0'
29 |
30 |
31 | engagement_ids = dict()
32 | engagement_ids[EngagementType.V1] = 3
33 | engagement_ids[EngagementType.V3] = 9
34 |
35 |
36 | class Weights(NamedTuple):
37 | cast: int = 10
38 | recast: int = 5
39 | reply: int = 7
40 | like: int = 1
41 |
42 | @staticmethod
43 | def from_str(weights_str: str) -> Self:
44 | wts = re.search(
45 | r'^([lL](\d{1,2}))?([cC](\d{1,2}))?([rR](\d{1,2}))?([yY](\d{1,2}))?$',
46 | weights_str,
47 | )
48 | if wts is None:
49 | raise Exception("Invalid weights")
50 | return Weights(
51 |             like=0 if wts.group(2) is None else int(wts.group(2)),
52 |             cast=0 if wts.group(4) is None else int(wts.group(4)),
53 |             recast=0 if wts.group(6) is None else int(wts.group(6)),
54 |             reply=0 if wts.group(8) is None else int(wts.group(8)),
55 | )
56 |
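57 | 
58 | # Illustrative example only (not part of the original module): the weights string packs
59 | # per-action weights as L=like, C=cast, R=recast, Y=reply; omitted actions default to 0.
60 | #   Weights.from_str("L1C10R5Y7") -> Weights(cast=10, recast=5, reply=7, like=1)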
--------------------------------------------------------------------------------
/serve/app/routers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/serve/app/routers/__init__.py
--------------------------------------------------------------------------------
/serve/app/routers/token_router.py:
--------------------------------------------------------------------------------
1 | from collections.abc import Sequence
2 | from typing import Self
3 |
4 | from asyncpg import Pool
5 | from eth_typing import ChecksumAddress
6 | from eth_utils import to_bytes, to_checksum_address
7 | from fastapi import APIRouter, Depends, HTTPException, Path, Query
8 | from pydantic import BaseModel, ValidationError, field_validator
9 |
10 | from ..dependencies import db_pool
11 | from ..dependencies.db_utils import get_token_balances
12 |
13 | router = APIRouter(prefix="/{token}", tags=["Token"])
14 |
15 |
16 | class Token(BaseModel):
17 | """
18 | Token address.
19 |
20 | TODO(ek) - expand to CAIP-19, to add chain ID and stuff.
21 | """
22 |
23 | address: ChecksumAddress
24 |
25 | @field_validator("address", mode="before")
26 | def ensure_address(cls, v):
27 | try:
28 | return to_checksum_address(v)
29 | except Exception:
30 | raise ValueError(f"Invalid token address: {v!r}")
31 |
32 | @classmethod
33 | def from_str(cls, v: str) -> Self:
34 | return cls(address=to_checksum_address(v))
35 |
36 |
37 | def get_token(token: str = Path(description="ERC20 token address")) -> Token:
38 | try:
39 | return Token.from_str(token)
40 |     except (ValueError, ValidationError):
41 | raise HTTPException(status_code=422, detail=f"Invalid token {token!r}")
42 |
43 |
44 | @router.get("/balances")
45 | async def get_balances(
46 | token: Token = Depends(get_token),
47 | fids: Sequence[int] = Query(..., alias='fid', min_items=1),
48 | pool: Pool = Depends(db_pool.get_db),
49 | ):
50 | rows = await get_token_balances(to_bytes(hexstr=token.address), fids, pool)
51 | balances = {fid: value for fid, value in rows}
52 | return {
53 | "balances": [
54 | {"fid": fid, "value": str(int(balances.get(fid, 0)))} for fid in fids
55 | ]
56 | }
57 |
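58 | 
59 | # Illustrative request shape only (not part of the original module). The host and token
60 | # address are placeholders, and any mount prefix depends on how this router is included
61 | # in the app. Repeating the 'fid' query param supplies multiple FIDs:
62 | #   curl 'http://localhost:8000/0xdAC17F958D2ee523a2206206994597C13D831ec7/balances?fid=2&fid=3'
63 | #   -> {"balances": [{"fid": 2, "value": "..."}, {"fid": 3, "value": "..."}]}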
--------------------------------------------------------------------------------
/serve/app/routers/user_router.py:
--------------------------------------------------------------------------------
1 | from typing import Annotated, List, Optional
2 |
3 | from asyncpg.pool import Pool
4 | from fastapi import APIRouter, Depends, Header, Query
5 | from loguru import logger
6 |
7 | from ..dependencies import db_pool, db_utils, graph
8 |
9 | router = APIRouter(tags=["User Labels (Requires API Key)"])
10 |
11 |
12 | @router.get("/labels/global/top_casters")
13 | async def get_top_global_casters(
14 | x_api_key: Optional[str] = Header(None), # used only for swagger ui
15 | offset: Annotated[int | None, Query()] = 0,
16 | limit: Annotated[int | None, Query(le=1000)] = 100,
17 | pool: Pool = Depends(db_pool.get_db),
18 | ):
19 | """
20 | Get the top global casters
21 | This API takes optional parameters -
22 | offset and limit
23 | Parameter 'offset' is used to specify how many results to skip
24 | and can be useful for paginating through results. \n
25 | Parameter 'limit' is used to specify the number of results to return. \n
26 | Header 'x-api-key' is used to authenticate the user. Please contact hello@karma3labs.com or https://t.me/Karma3Labs to get the trial API key. \n
27 | """
28 |
29 | top_casters = await db_utils.get_top_casters(offset=offset, limit=limit, pool=pool)
30 | return {"result": top_casters}
31 |
32 |
33 | @router.get("/labels/global/top_spammers")
34 | async def get_top_global_spammers(
35 | x_api_key: Optional[str] = Header(None), # used only for swagger ui
36 | offset: Annotated[int | None, Query()] = 0,
37 | limit: Annotated[int | None, Query(le=1000)] = 100,
38 | pool: Pool = Depends(db_pool.get_db),
39 | ):
40 | """
41 | Get the top global spammers
42 | This API takes optional parameters -
43 | offset and limit
44 | Parameter 'offset' is used to specify how many results to skip
45 | and can be useful for paginating through results. \n
46 | Parameter 'limit' is used to specify the number of results to return. \n
47 | Header 'x-api-key' is used to authenticate the user. Please contact hello@karma3labs.com or https://t.me/Karma3Labs to get the trial API key. \n
48 | """
49 |
50 | top_spammers = await db_utils.get_top_spammers(
51 | offset=offset, limit=limit, pool=pool
52 | )
53 | return {"result": top_spammers}
54 |
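55 | 
56 | # Illustrative request shape only (not part of the original module). The base URL and API
57 | # key are placeholders, and any mount prefix depends on how this router is included in the app:
58 | #   curl -H 'x-api-key: YOUR_API_KEY' \
59 | #     'http://localhost:8000/labels/global/top_casters?offset=0&limit=10'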
--------------------------------------------------------------------------------
/serve/docker-compose.yml:
--------------------------------------------------------------------------------
1 | services:
2 | farcaster-graph:
3 | build: .
4 | container_name: farcaster-graph
5 | image: farcaster-graph:latest
6 | volumes:
7 | - /home/ubuntu/serve_files:/tmp
8 | environment:
9 | PORT: 8000
10 | ports:
11 | - '8000:8000'
12 | deploy:
13 | resources:
14 | limits:
15 | memory: 64G
16 | restart: unless-stopped
17 | extra_hosts:
18 | - "host.docker.internal:host-gateway"
19 | networks:
20 | - farcaster-network
21 |
22 | networks:
23 | farcaster-network:
24 | name: farcaster-network
25 | external: true
--------------------------------------------------------------------------------
/serve/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.poetry]
2 | name = "serve"
3 | version = "0.1.0"
4 | description = ""
5 | authors = ["Vijay Mariadassou "]
6 | readme = "README.md"
7 |
8 | [tool.poetry.dependencies]
9 | python = "^3.12"
10 | fastapi = "^0.109.0"
11 | uvicorn = "^0.27.0"
12 | asyncpg = "^0.29.0"
13 | sqlalchemy = "^2.0.25"
14 | loguru = "^0.7.2"
15 | igraph = "^0.11.3"
16 | pydantic-settings = "^2.1.0"
17 | psutil = "^5.9.8"
18 | pandas = {extras = ["performance"], version = "^2.2.2"}
19 | numpy = "^1.26.4"
20 | requests = "^2.31.0"
21 | opentelemetry-distro = "0.43b0"
22 | opentelemetry-instrumentation-fastapi = "0.43b0"
23 | opentelemetry-instrumentation-logging = "0.43b0"
24 | opentelemetry-exporter-otlp = "1.22.0"
25 | prometheus-client = "0.19.0"
26 | asgi-correlation-id = "^4.3.1"
27 | niquests = "^3.14.0"
28 | py-memoize = "^3.1.1"
29 | black = "^25.1.0"
30 | async-lru = "^2.0.5"
31 | isort = "^6.0.1"
32 | eth-typing = "^5.2.1"
33 | eth-utils = "^5.3.0"
34 | eth-hash = {extras = ["pycryptodome"], version = "^0.7.1"}
35 | cashews = {extras = ["diskcache"], version = "^7.4.0"}
36 |
37 | [build-system]
38 | requires = ["poetry-core"]
39 | build-backend = "poetry.core.masonry.api"
40 |
41 | [project]
42 | name = "serve"
43 | version = "0.1.0"
44 | requires-python = ">=3.12"
45 |
46 | [tool.black]
47 | skip-string-normalization = true
48 |
--------------------------------------------------------------------------------
/serve/samples/fc_90dv3_fid_SUCCESS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/serve/samples/fc_90dv3_fid_SUCCESS
--------------------------------------------------------------------------------
/serve/samples/fc_90dv3_fid_df.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/serve/samples/fc_90dv3_fid_df.pkl
--------------------------------------------------------------------------------
/serve/samples/fc_90dv3_fid_ig.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/serve/samples/fc_90dv3_fid_ig.pkl
--------------------------------------------------------------------------------
/serve/samples/fc_engagement_SUCCESS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/serve/samples/fc_engagement_SUCCESS
--------------------------------------------------------------------------------
/serve/samples/fc_engagement_df.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/serve/samples/fc_engagement_df.pkl
--------------------------------------------------------------------------------
/serve/samples/fc_engagement_fid_SUCCESS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/serve/samples/fc_engagement_fid_SUCCESS
--------------------------------------------------------------------------------
/serve/samples/fc_engagement_fid_df.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/serve/samples/fc_engagement_fid_df.pkl
--------------------------------------------------------------------------------
/serve/samples/fc_engagement_fid_ig.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/serve/samples/fc_engagement_fid_ig.pkl
--------------------------------------------------------------------------------
/serve/samples/fc_engagement_idx.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/serve/samples/fc_engagement_idx.pkl
--------------------------------------------------------------------------------
/serve/samples/fc_engagement_ig.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/serve/samples/fc_engagement_ig.pkl
--------------------------------------------------------------------------------
/serve/samples/fc_following_SUCCESS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/serve/samples/fc_following_SUCCESS
--------------------------------------------------------------------------------
/serve/samples/fc_following_df.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/serve/samples/fc_following_df.pkl
--------------------------------------------------------------------------------
/serve/samples/fc_following_fid_SUCCESS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/serve/samples/fc_following_fid_SUCCESS
--------------------------------------------------------------------------------
/serve/samples/fc_following_fid_df.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/serve/samples/fc_following_fid_df.pkl
--------------------------------------------------------------------------------
/serve/samples/fc_following_fid_ig.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/serve/samples/fc_following_fid_ig.pkl
--------------------------------------------------------------------------------
/serve/samples/fc_following_idx.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/serve/samples/fc_following_idx.pkl
--------------------------------------------------------------------------------
/serve/samples/fc_following_ig.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/serve/samples/fc_following_ig.pkl
--------------------------------------------------------------------------------
/serve/samples/personal_graph.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/serve/samples/personal_graph.parquet
--------------------------------------------------------------------------------
/serve/scripts/lint.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | unset -v tooldir opt
3 | OPTIND=1
4 | while getopts :b: opt
5 | do
6 | case "${opt}" in
7 | '?') echo "unrecognized option -${OPTARG}" >&2; exit 64;;
8 | ':') echo "missing argument for -${OPTARG}" >&2; exit 64;;
9 | b) tooldir="${OPTARG}";;
10 | *) echo "unhandled option -${opt}" >&2; exit 70;;
11 | esac
12 | done
13 | shift $((OPTIND - 1))
14 | case "${tooldir+set}" in
15 | set) PATH="${tooldir}${PATH+":${PATH}"}"; export PATH;;
16 | esac
17 | case $# in
18 | 0)
19 | set -- .
20 | ;;
21 | esac
22 | isort --profile=black "$@" || exit
23 | black --quiet "$@" || exit
24 | #autopep8 --in-place --aggressive --aggressive --recursive "$@" || exit
25 |
--------------------------------------------------------------------------------
/serve/static/favicon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/serve/static/favicon.png
--------------------------------------------------------------------------------
/sql/counts_by_day.sql:
--------------------------------------------------------------------------------
1 | WITH casts_counts AS (
2 | SELECT DATE_TRUNC('day', timestamp) AS day, COUNT(*) AS casts_count
3 | FROM casts
4 | GROUP BY DATE_TRUNC('day', timestamp)
5 | ),
6 | links_counts AS (
7 | SELECT DATE_TRUNC('day', timestamp) AS day, COUNT(*) AS links_count
8 | FROM links
9 | GROUP BY DATE_TRUNC('day', timestamp)
10 | ),
11 | messages_counts AS (
12 | SELECT DATE_TRUNC('day', timestamp) AS day, COUNT(*) AS messages_count
13 | FROM messages
14 | GROUP BY DATE_TRUNC('day', timestamp)
15 | ),
16 | reactions_counts AS (
17 | SELECT DATE_TRUNC('day', timestamp) AS day, COUNT(*) AS reactions_count
18 | FROM reactions
19 | GROUP BY DATE_TRUNC('day', timestamp)
20 | ),
21 | user_data_counts AS (
22 | SELECT DATE_TRUNC('day', timestamp) AS day, COUNT(*) AS user_data_count
23 | FROM user_data
24 | GROUP BY DATE_TRUNC('day', timestamp)
25 | ),
26 | verifications_counts AS (
27 | SELECT DATE_TRUNC('day', timestamp) AS day, COUNT(*) AS verifications_count
28 | FROM verifications
29 | GROUP BY DATE_TRUNC('day', timestamp)
30 | )
31 | SELECT
32 | COALESCE(casts.day, links.day, messages.day, reactions.day, user_data.day, verifications.day) AS day,
33 | COALESCE(casts_count, 0) AS casts_count,
34 | COALESCE(links_count, 0) AS links_count,
35 | COALESCE(messages_count, 0) AS messages_count,
36 | COALESCE(reactions_count, 0) AS reactions_count,
37 | COALESCE(user_data_count, 0) AS user_data_count,
38 | COALESCE(verifications_count, 0) AS verifications_count
39 | FROM casts_counts casts
40 | FULL OUTER JOIN links_counts links ON casts.day = links.day
41 | FULL OUTER JOIN reactions_counts reactions ON COALESCE(casts.day, links.day) = reactions.day
42 | FULL OUTER JOIN verifications_counts verifications ON COALESCE(casts.day, links.day, reactions.day) = verifications.day
43 | FULL OUTER JOIN messages_counts messages ON COALESCE(casts.day, links.day, reactions.day, verifications.day) = messages.day
44 | FULL OUTER JOIN user_data_counts user_data ON COALESCE(casts.day, links.day, reactions.day, verifications.day, messages.day) = user_data.day
45 | ORDER BY day DESC
46 | LIMIT 1000;
47 |
--------------------------------------------------------------------------------
/sql/counts_by_table.sql:
--------------------------------------------------------------------------------
1 | WITH
2 | q_casts AS (SELECT COUNT(1) AS casts FROM casts),
3 | q_chain_events AS (SELECT COUNT(1) AS chain_events FROM chain_events),
4 | q_fids AS (SELECT COUNT(1) AS fids FROM fids),
5 | q_fnames AS (SELECT COUNT(1) AS fnames FROM fnames),
6 | q_links AS (SELECT COUNT(1) AS links FROM links),
7 | q_messages AS (SELECT COUNT(1) AS messages FROM messages),
8 | q_reactions AS (SELECT COUNT(1) AS reactions FROM reactions),
9 | q_signers AS (SELECT COUNT(1) AS signers FROM signers),
10 | q_storage_alloc AS (SELECT COUNT(1) AS storage_alloc FROM storage_allocations),
11 | q_user_data AS (SELECT COUNT(1) AS user_data FROM user_data),
12 | q_username_proofs AS (SELECT COUNT(1) AS username_proofs FROM username_proofs),
13 | q_verifications AS (SELECT COUNT(1) AS verifications FROM verifications)
14 |
15 | SELECT
16 | q_casts.casts,
17 | q_chain_events.chain_events,
18 | q_fids.fids,
19 | q_fnames.fnames,
20 | q_links.links,
21 | q_messages.messages,
22 | q_reactions.reactions,
23 | q_signers.signers,
24 | q_storage_alloc.storage_alloc,
25 | q_user_data.user_data,
26 | q_username_proofs.username_proofs,
27 | q_verifications.verifications
28 | FROM
29 | q_casts,
30 | q_chain_events,
31 | q_fids,
32 | q_fnames,
33 | q_links,
34 | q_messages,
35 | q_reactions,
36 | q_signers,
37 | q_storage_alloc,
38 | q_user_data,
39 | q_username_proofs,
40 | q_verifications;
41 |
--------------------------------------------------------------------------------
/sql/neynar-replica/.env.sample:
--------------------------------------------------------------------------------
1 | POSTGRES_HOST=127.0.0.1
2 | POSTGRES_PORT=9541
3 | POSTGRES_USER=postgres
4 | POSTGRES_NAME=postgres
5 | POSTGRES_PASSWORD=CHANGEME
6 | PRIMARY_HOST=135.181.236.185
7 | PRIMARY_PORT=9541
8 | PRIMARY_USER=replica_user
9 | PRIMARY_PASSWORD=CHANGEME
10 | PRIMARY_SLOT_NAME=eigen10
11 | PGDATA=/var/lib/postgresql/data
12 | GID=999
13 | UID=
14 | HOST_VOLUME=/data/pgdata
--------------------------------------------------------------------------------
/sql/neynar-replica/Dockerfile:
--------------------------------------------------------------------------------
1 | Dockerfile.noble
--------------------------------------------------------------------------------
/sql/neynar-replica/Dockerfile.alpine:
--------------------------------------------------------------------------------
1 | FROM postgres:17.2-alpine
2 |
3 | # Install sudo and configure it for passwordless operation
4 | RUN apk add --no-cache sudo && \
5 | echo "postgres ALL=(ALL) NOPASSWD: ALL" > /etc/sudoers.d/postgres
6 |
7 | COPY entrypoint.sh /usr/local/bin/entrypoint.sh
8 | RUN chmod +x /usr/local/bin/entrypoint.sh
9 |
10 | # Set the entrypoint script
11 | ENTRYPOINT ["/usr/local/bin/entrypoint.sh"]
12 |
--------------------------------------------------------------------------------
/sql/neynar-replica/Dockerfile.noble:
--------------------------------------------------------------------------------
1 | # Use a base image with glibc 2.39
2 | FROM ubuntu:noble
3 |
4 | # Install necessary packages
5 | RUN apt-get update && \
6 | apt-get install -y sudo curl gnupg lsb-release && \
7 | apt-get clean && \
8 | rm -rf /var/lib/apt/lists/*;
9 |
10 | # Install locales
11 | RUN apt-get update && \
12 | apt-get install -y --no-install-recommends locales && \
13 | rm -rf /var/lib/apt/lists/*;
14 | RUN echo 'en_US.UTF-8 UTF-8' >> /etc/locale.gen; \
15 | locale-gen; \
16 | locale -a | grep 'en_US.utf8'
17 |
18 | # Add PostgreSQL repository
19 | RUN sh -c 'echo "deb http://apt.postgresql.org/pub/repos/apt $(lsb_release -cs)-pgdg main" > /etc/apt/sources.list.d/pgdg.list' && \
20 | curl -fsSL https://www.postgresql.org/media/keys/ACCC4CF8.asc | sudo gpg --dearmor -o /etc/apt/trusted.gpg.d/postgresql.gpg
21 |
22 | ARG GID
23 | ARG UID
24 | # Force postgres installation to use uid=999 and gid=999
25 | RUN set -eux; \
26 | groupadd -r postgres --gid=${GID}; \
27 | useradd -r -g postgres --uid=${UID} --home-dir=/var/lib/postgresql --shell=/bin/bash postgres;
28 |
29 | # Install PostgreSQL
30 | RUN apt-get update
31 | RUN apt-get install -y postgresql-17
32 | RUN apt-get clean && \
33 | rm -rf /var/lib/apt/lists/*
34 |
35 | # Set up sudo for postgres user
36 | RUN echo "postgres ALL=(ALL) NOPASSWD: ALL" > /etc/sudoers.d/postgres && \
37 | chmod 440 /etc/sudoers.d/postgres
38 |
39 | ENV PG_MAJOR=17
40 | ENV PATH=$PATH:/usr/lib/postgresql/$PG_MAJOR/bin
41 |
42 | RUN echo 'Defaults secure_path="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/lib/postgresql/17/bin"' \
43 | >> /etc/sudoers.d/postgres
44 |
45 | COPY entrypoint.sh /usr/local/bin/entrypoint.sh
46 | RUN chmod +x /usr/local/bin/entrypoint.sh
47 |
48 | # Set the entrypoint script
49 | ENTRYPOINT ["/usr/local/bin/entrypoint.sh"]
50 |
--------------------------------------------------------------------------------
/sql/neynar-replica/docker-compose.yml:
--------------------------------------------------------------------------------
1 | services:
2 | postgres:
3 | restart: unless-stopped
4 | container_name: eigen8-replica-postgres
5 | shm_size: '32gb'
6 | build:
7 | context: .
8 | args:
9 | GID: ${GID}
10 | UID: ${UID}
11 | ports:
12 | - '${POSTGRES_PORT}:5432'
13 | environment:
14 | POSTGRES_DB: ${POSTGRES_NAME}
15 | POSTGRES_USER: ${POSTGRES_USER}
16 | POSTGRES_PASSWORD: ${POSTGRES_PASSWORD}
17 | PRIMARY_HOST: ${PRIMARY_HOST}
18 | PRIMARY_PORT: ${PRIMARY_PORT}
19 | PRIMARY_USER: ${PRIMARY_USER}
20 | PRIMARY_PASSWORD: ${PRIMARY_PASSWORD}
21 | PRIMARY_SLOT_NAME: ${PRIMARY_SLOT_NAME}
22 | PGDATA: ${PGDATA}
23 | volumes:
24 | - ${HOST_VOLUME}:/var/lib/postgresql/data
25 | - ${PWD}/postgresql.conf:/usr/local/bin/postgresql.conf
26 | - ${PWD}/pg_hba.conf:/usr/local/bin/pg_hba.conf
27 | healthcheck:
28 | test: ['CMD-SHELL', 'pg_isready --dbname=${POSTGRES_NAME} -U ${PRIMARY_USER}']
29 | interval: 10s
30 | timeout: 10s
31 | retries: 3
32 | networks:
33 | - farcaster-network
34 |
35 | networks:
36 | farcaster-network:
37 | external: true
38 | name: farcaster-network
39 |
40 | volumes:
41 | postgres-data:
42 | name: neynar-replica
43 |
--------------------------------------------------------------------------------
/sql/neynar-replica/entrypoint.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | # Directory where the data will be stored
5 | DATA_DIR="/var/lib/postgresql/data"
6 | CONF_DIR="/var/lib/postgresql/conf"
7 |
8 | # Ensure environment variables are set
9 | if [ -z "$PRIMARY_HOST" ] || [ -z "$PRIMARY_PORT" ] || [ -z "$PRIMARY_USER" ] || [ -z "$PRIMARY_PASSWORD" ] || [ -z "$PRIMARY_SLOT_NAME" ]; then
10 | echo "Error: Environment variables not set correctly."
11 | exit 1
12 | fi
13 |
14 | # Prepare configuration directory (outside of data directory)
15 | mkdir -p $CONF_DIR
16 | cp /usr/local/bin/postgresql.conf $CONF_DIR/postgresql.conf
17 | cp /usr/local/bin/pg_hba.conf $CONF_DIR/pg_hba.conf
18 |
19 | # Check if the data directory is empty
20 | if [ "$(ls -A $DATA_DIR)" ]; then
21 | echo "Data directory is not empty."
22 | else
23 | echo "Data directory is empty, setting up .pgpass file..."
24 | echo "$PRIMARY_HOST:$PRIMARY_PORT:*:$PRIMARY_USER:$PRIMARY_PASSWORD" > /root/.pgpass
25 | chmod 600 /root/.pgpass
26 |
27 | echo "Initiating base backup..."
28 | pg_config --version
29 | pg_basebackup -h $PRIMARY_HOST -p $PRIMARY_PORT -D $DATA_DIR -U $PRIMARY_USER -vP -w -Xs -R -S $PRIMARY_SLOT_NAME
30 |
31 | # Set the correct permissions
32 | chmod 0700 $DATA_DIR
33 | chown -R postgres:postgres $DATA_DIR
34 |
35 | # Move the customized config files into the freshly cloned data directory
36 | mv $CONF_DIR/postgresql.conf $DATA_DIR/postgresql.conf
37 | mv $CONF_DIR/pg_hba.conf $DATA_DIR/pg_hba.conf
38 |
39 | echo "Backup and configuration complete. Starting PostgreSQL in standby mode."
40 | fi
41 |
42 |
43 | # Start PostgreSQL using sudo
44 | exec sudo -u postgres postgres -D $DATA_DIR
45 |
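Note: pg_basebackup is invoked with -S "$PRIMARY_SLOT_NAME" but without -C/--create-slot, so the physical replication slot must already exist on the primary. A minimal sketch of creating it before the first container start, where the host, user, and slot name are placeholders:

    # run once against the PRIMARY, assuming the slot does not exist yet
    psql -h primary.example.internal -U replicator -d postgres \
      -c "SELECT pg_create_physical_replication_slot('eigen8_replica_slot');"

Alternatively, pg_basebackup can create the slot itself if the script is changed to pass -C/--create-slot.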
--------------------------------------------------------------------------------
/sql/neynar-replica/postgresql.conf:
--------------------------------------------------------------------------------
1 | listen_addresses = '*' # what IP address(es) to listen on; '*' = all interfaces
2 | port = 5432 # (change requires restart)
3 | max_connections = 400 # (change requires restart)
4 | shared_buffers = 8GB # min 128kB
5 | work_mem = 64MB # min 64kB
6 | maintenance_work_mem = 1GB # min 64kB
7 | dynamic_shared_memory_type = posix # the default is usually the first option
8 | max_worker_processes = 16 # (change requires restart)
9 | wal_level = replica # minimal, replica, or logical
10 | synchronous_commit = local # synchronization level; off, local, remote_write, remote_apply, or on
11 | wal_log_hints = on # also do full page writes of non-critical updates
12 | wal_compression = on # enables compression of full-page writes;
13 | checkpoint_timeout = 60min # range 30s-1d
14 | max_wal_size = 16GB
15 | min_wal_size = 80MB
16 | max_wal_senders = 10 # max number of walsender processes
17 | hot_standby = on # "off" disallows queries during recovery
18 | wal_receiver_timeout = 5min # terminate the replication connection if the primary is silent this long
19 | random_page_cost = 1.1 # same scale as seq_page_cost (1.0)
20 | effective_cache_size = 16GB
21 | log_line_prefix = '%m [%p] %q%u@%d ' # %m = timestamp (ms), %p = PID, %q = stop in non-session processes, %u = user, %d = database
22 | log_timezone = UTC
23 | cluster_name = '17/main' # added to process titles if nonempty
24 | default_transaction_read_only = on
25 | datestyle = 'iso, mdy'
26 | timezone = UTC
27 | shared_preload_libraries = 'pg_stat_statements' # (change requires restart)
28 |
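Note: with hot_standby = on and default_transaction_read_only = on, this replica serves read-only queries while streaming WAL from the primary. A quick status-check sketch, assuming the container name from the compose file above and that the accompanying pg_hba.conf permits local connections for the postgres role:

    # should return 't' on a healthy standby
    docker exec -it eigen8-replica-postgres sudo -u postgres psql -c "SELECT pg_is_in_recovery();"
    # shows the streaming connection back to the primary
    docker exec -it eigen8-replica-postgres sudo -u postgres psql -c "SELECT status, sender_host FROM pg_stat_wal_receiver;"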
--------------------------------------------------------------------------------
/sql/replicator_drop_fk.sql:
--------------------------------------------------------------------------------
1 | ALTER TABLE IF EXISTS ONLY public.verifications DROP CONSTRAINT IF EXISTS verifications_hash_foreign;
2 | ALTER TABLE IF EXISTS ONLY public.verifications DROP CONSTRAINT IF EXISTS verifications_fid_foreign;
3 | ALTER TABLE IF EXISTS ONLY public.username_proofs DROP CONSTRAINT IF EXISTS username_proofs_fid_foreign;
4 | ALTER TABLE IF EXISTS ONLY public.user_data DROP CONSTRAINT IF EXISTS user_data_hash_foreign;
5 | ALTER TABLE IF EXISTS ONLY public.user_data DROP CONSTRAINT IF EXISTS user_data_fid_foreign;
6 | ALTER TABLE IF EXISTS ONLY public.signers DROP CONSTRAINT IF EXISTS signers_requester_fid_foreign;
7 | ALTER TABLE IF EXISTS ONLY public.signers DROP CONSTRAINT IF EXISTS signers_remove_chain_event_id_foreign;
8 | ALTER TABLE IF EXISTS ONLY public.signers DROP CONSTRAINT IF EXISTS signers_fid_foreign;
9 | ALTER TABLE IF EXISTS ONLY public.signers DROP CONSTRAINT IF EXISTS signers_add_chain_event_id_foreign;
10 | ALTER TABLE IF EXISTS ONLY public.reactions DROP CONSTRAINT IF EXISTS reactions_target_hash_foreign;
11 | ALTER TABLE IF EXISTS ONLY public.reactions DROP CONSTRAINT IF EXISTS reactions_hash_foreign;
12 | ALTER TABLE IF EXISTS ONLY public.reactions DROP CONSTRAINT IF EXISTS reactions_fid_foreign;
13 | ALTER TABLE IF EXISTS ONLY public.messages DROP CONSTRAINT IF EXISTS messages_signer_fid_foreign;
14 | ALTER TABLE IF EXISTS ONLY public.messages DROP CONSTRAINT IF EXISTS messages_fid_foreign;
15 | ALTER TABLE IF EXISTS ONLY public.links DROP CONSTRAINT IF EXISTS links_target_fid_foreign;
16 | ALTER TABLE IF EXISTS ONLY public.links DROP CONSTRAINT IF EXISTS links_fid_foreign;
17 | ALTER TABLE IF EXISTS ONLY public.fnames DROP CONSTRAINT IF EXISTS fnames_fid_foreign;
18 | ALTER TABLE IF EXISTS ONLY public.storage_allocations DROP CONSTRAINT IF EXISTS fids_chain_event_id_foreign;
19 | ALTER TABLE IF EXISTS ONLY public.fids DROP CONSTRAINT IF EXISTS fids_chain_event_id_foreign;
20 | ALTER TABLE IF EXISTS ONLY public.casts DROP CONSTRAINT IF EXISTS casts_hash_foreign;
21 | ALTER TABLE IF EXISTS ONLY public.casts DROP CONSTRAINT IF EXISTS casts_fid_foreign;
22 |
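Note: every statement above uses IF EXISTS, so the script is safe to re-run. A minimal invocation sketch, assuming it is applied with psql; the host, user, and database name are placeholders:

    psql -h localhost -U postgres -d replicator -f sql/replicator_drop_fk.sql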
--------------------------------------------------------------------------------