├── .gitignore ├── README.md ├── notebooks ├── Compare_APIs.ipynb ├── Compare_Timing_Logs.ipynb ├── GenGlobalTrust_indexed.ipynb ├── GenLocalTrust.ipynb ├── GenPersonalGraph.ipynb ├── README.md ├── debug_prod_graph.ipynb ├── igraph-engagement_addr.ipynb ├── igraph-engagement_fid.ipynb ├── requirements.txt └── scripts_export │ └── GenPersonalGraph.py ├── pipeline ├── .env.sample ├── Dockerfile ├── README.md ├── casts │ ├── __init__.py │ ├── cast_db_utils.py │ ├── main.py │ ├── main_fetch_top_casters.py │ └── main_fetch_top_spammers.py ├── channels │ ├── Bot_Fids.csv │ ├── Seed_Fids.csv │ ├── Top_Channels.csv │ ├── Trending_Channels.csv │ ├── __init__.py │ ├── channel_db_utils.py │ ├── channel_queries.py │ ├── channel_utils.py │ ├── main.py │ ├── main_channel_rank.py │ ├── main_fetch_channel_top_casters.py │ ├── main_metrics.py │ ├── main_notify_daily_trending.py │ ├── main_notify_leaderboard.py │ ├── main_notify_weekly_mods.py │ ├── main_openrank.py │ ├── main_points.py │ ├── main_tokens.py │ └── openrank_utils.py ├── config.py ├── crontab.txt ├── cura_utils.py ├── dag_utils │ ├── clear_task_instance.py │ ├── combine_csv.py │ └── dune_backup.py ├── dags │ ├── archived │ │ ├── dag_automod.py │ │ ├── dag_backup_sandbox_db.py │ │ ├── dag_copy_graph_files_to_sandbox_dev_v1.py │ │ ├── dag_degen_tips_processing.py │ │ ├── dag_gen_personal_graph_replica_v0.py │ │ ├── dag_insert_degen_ranking_v0.py │ │ ├── dag_monitor_sandbox.py │ │ ├── dag_run_frame_pipeline_v0.py │ │ ├── degen │ │ │ ├── calculate_rank.py │ │ │ └── create_degen_sql_functions.py │ │ ├── extractors │ │ │ ├── dag_warpcast_channel_followers.py │ │ │ ├── dag_warpcast_channel_members.py │ │ │ └── dag_warpcast_channels.py │ │ └── sandbox │ │ │ ├── dag_sync_sandbox_casts.py │ │ │ ├── dag_sync_sandbox_channel_fids.py │ │ │ ├── dag_sync_sandbox_db_dev.py │ │ │ ├── dag_sync_sandbox_globaltrust.py │ │ │ └── dag_sync_sandbox_labels.py │ ├── cura │ │ ├── dag_direct_cast_join_requests.py │ │ ├── dag_run_autoinvite_rules.py │ │ └── dag_run_quote_casts.py │ ├── dag_backup_to_s3_v1.py │ ├── dag_copy_graph_files_to_replicas_v1.py │ ├── dag_gen_channel_openrank.py │ ├── dag_gen_channel_ranking_v3.py │ ├── dag_gen_channel_ranking_v4.py │ ├── dag_gen_globaltrust_v1.py │ ├── dag_gen_personal_graph_replica_v1.py │ ├── dag_notify_channel_daily_trending.py │ ├── dag_notify_channel_leaderboard.py │ ├── dag_notify_channel_weekly_mods.py │ ├── dag_refresh_rank_view_v0.py │ ├── dag_run_cast_pipeline_v0.py │ ├── dag_update_channel_points.py │ ├── dag_update_channel_tokens.py │ ├── extractors │ │ └── dag_cura_mod.py │ ├── monitoring │ │ ├── __init__.py │ │ ├── dag_monitor_nindexer.py │ │ └── dag_monitor_replication.py │ ├── one_off │ │ ├── .placeholder │ │ ├── dag_gen_globaltrust_by_date_v0.py │ │ ├── dag_gen_globaltrust_by_date_v1.py │ │ ├── dag_insert_to_dune_table.py │ │ ├── dag_migrate_dune_table.py │ │ ├── dag_trial_branch.py │ │ ├── dag_trial_sql.py │ │ ├── dag_trial_task_groups.py │ │ └── dag_trial_trigger.py │ ├── pg_to_dune │ │ ├── .env.sample │ │ ├── app │ │ │ └── check_last_timestamp.py │ │ └── upload_to_dune.sh │ ├── reports │ │ ├── dag_gen_channel_metrics.py │ │ └── dag_gen_labels.py │ └── triggers │ │ ├── trigger_gen_channel_ranking_v3.py │ │ └── trigger_gen_channel_ranking_v4.py ├── db_utils.py ├── docker-compose.yaml ├── extractors │ ├── automod_extractor.py │ ├── channel_extractor_utils.py │ ├── cura_mod_extractor.py │ ├── extract_channel_data.sh │ ├── extract_channel_fids.sh │ ├── extract_cura_mod.sh │ ├── main_channel_data.py │ └── 
main_channel_fids.py ├── frames │ ├── __init__.py │ ├── frames_db_utils.py │ ├── incremental_load_cast_mapping.sql │ ├── incremental_load_labels.sql │ ├── main.py │ ├── scrape_utils.py │ └── test_urls.py ├── globaltrust │ ├── __init__.py │ ├── compute.py │ ├── export_localtrust_daily_stats.sql │ ├── gen_globaltrust.py │ ├── queries.py │ └── test_data.py ├── go_eigentrust.py ├── graph │ ├── __init__.py │ ├── export_existingConnections_addr.sql │ ├── export_existingConnections_fid.sql │ ├── export_l1rep6rec3m12enhancedConnections_addr.sql │ ├── export_l1rep6rec3m12enhancedConnections_fid.sql │ ├── fetch_nodes_edges.py │ ├── gen_igraph.py │ ├── gen_personal_graph_amp.py │ ├── gen_personal_graph_amp_v1.py │ ├── graph_utils.py │ ├── rechunk_graph_pqt.py │ └── serve_igraph.py ├── igraph-docker-compose.yml ├── igraph.Dockerfile ├── igraph.nginx.conf ├── logs │ └── .placeholder ├── plugins │ ├── .placeholder │ ├── __init__.py │ └── hooks │ │ ├── __init__.py │ │ ├── common.py │ │ ├── discord.py │ │ └── pagerduty.py ├── requirements.txt ├── run_cast_pipeline.sh ├── run_channel_metrics.sh ├── run_channel_openrank.sh ├── run_channel_scraper_v3.sh ├── run_channel_scraper_v4.sh ├── run_download_pqt_files_v1.sh ├── run_eigen2_postgres_sql.sh ├── run_eigen8_postgres_sql.sh ├── run_fetch_channel_top_caster.sh ├── run_fetch_top_caster.sh ├── run_fetch_top_spammers.sh ├── run_frame_scraper.sh ├── run_globaltrust_pipeline.sh ├── run_graph_pipeline.sh ├── run_notify_channel_daily_trending.sh ├── run_notify_channel_leaderboard.sh ├── run_notify_channel_weekly_mods.sh ├── run_personal_graph_pipeline_v1.sh ├── run_update_channel_points.sh ├── run_update_channel_tokens.sh ├── samples │ ├── localtrust-engagement.csv │ ├── localtrust-following.csv │ └── pretrust.csv ├── schema │ ├── globaltrust_config.sql │ ├── k3l_objects.sql │ ├── k3l_schema.sql │ ├── neynar_db_schema.sql │ ├── pretrust_v2.sql │ └── replicator_db_schema.sql ├── scripts │ ├── archived │ │ ├── run_create_degen_db_functions.sh │ │ ├── run_personal_graph_pipeline.sh │ │ ├── run_sandbox_backup.sh │ │ └── run_urlextract_pipeline.sh │ └── one_off │ │ ├── diff_db_table.py │ │ ├── diff_json_api.py │ │ └── run_cast_pipeline_gapfills.sh ├── sshtunnel.Dockerfile ├── timer.py ├── tmp │ └── .placeholder └── utils.py ├── scripts ├── .placeholder └── certs │ └── graphcast_jobs │ ├── .env.sample │ ├── README.md │ ├── graph.cast.k3l.io │ ├── graph.castN.k3l.io │ ├── install_certs.sh │ └── push_certs.sh ├── serve ├── .dockerignore ├── .env.sample ├── .gitignore ├── .idea │ ├── .gitignore │ ├── .name │ ├── codeStyles │ │ └── codeStyleConfig.xml │ ├── dataSources.xml │ ├── data_source_mapping.xml │ ├── farcaster-graph-serve.iml │ ├── inspectionProfiles │ │ └── profiles_settings.xml │ ├── misc.xml │ ├── modules.xml │ ├── sqldialects.xml │ ├── vcs.xml │ └── watcherTasks.xml ├── Dockerfile ├── README.md ├── app │ ├── __init__.py │ ├── config.py │ ├── dependencies │ │ ├── __init__.py │ │ ├── cache_db_utils.py │ │ ├── db_pool.py │ │ ├── db_utils.py │ │ ├── graph.py │ │ ├── logging.py │ │ └── memoize_utils.py │ ├── graph_loader.py │ ├── main.py │ ├── models │ │ ├── __init__.py │ │ ├── channel_model.py │ │ ├── feed_model.py │ │ ├── graph_model.py │ │ └── score_model.py │ ├── routers │ │ ├── __init__.py │ │ ├── cast_router.py │ │ ├── channel_router.py │ │ ├── direct_router.py │ │ ├── frame_router.py │ │ ├── globaltrust_router.py │ │ ├── graph_router.py │ │ ├── localtrust_router.py │ │ ├── metadata_router.py │ │ ├── token_router.py │ │ └── user_router.py │ ├── telemetry.py │ 
└── utils.py ├── docker-compose.yml ├── poetry.lock ├── pyproject.toml ├── samples │ ├── fc_90dv3_fid_SUCCESS │ ├── fc_90dv3_fid_df.pkl │ ├── fc_90dv3_fid_ig.pkl │ ├── fc_engagement_SUCCESS │ ├── fc_engagement_df.pkl │ ├── fc_engagement_fid_SUCCESS │ ├── fc_engagement_fid_df.pkl │ ├── fc_engagement_fid_ig.pkl │ ├── fc_engagement_idx.pkl │ ├── fc_engagement_ig.pkl │ ├── fc_following_SUCCESS │ ├── fc_following_df.pkl │ ├── fc_following_fid_SUCCESS │ ├── fc_following_fid_df.pkl │ ├── fc_following_fid_ig.pkl │ ├── fc_following_idx.pkl │ ├── fc_following_ig.pkl │ ├── fid_scores.json │ ├── lt_existingConnections_addr.csv │ ├── lt_existingConnections_fid.csv │ ├── lt_fboostedl1rep3rec6m12_90d_fid.csv │ ├── lt_l1rep6rec3m12enhancedConnections_addr.csv │ ├── lt_l1rep6rec3m12enhancedConnections_fid.csv │ └── personal_graph.parquet ├── scratchpad.md ├── scripts │ └── lint.sh └── static │ └── favicon.png └── sql ├── counts_by_day.sql ├── counts_by_table.sql ├── k3l_requirements.sql ├── neynar-replica ├── .env.sample ├── Dockerfile ├── Dockerfile.alpine ├── Dockerfile.noble ├── docker-compose.yml ├── entrypoint.sh ├── pg_hba.conf ├── postgresql.conf └── postgresql.conf.orig ├── replicator_drop_fk.sql └── replicator_schema.sql /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules/ 2 | dist/ 3 | .env 4 | .env.docker 5 | .*.credentials.json 6 | build/ 7 | .venv 8 | *.pyc 9 | **/.ipynb_checkpoints 10 | **/.DS_Store 11 | **/lib/ 12 | notebooks/data/ 13 | **/pg_to_dune/csv 14 | pipeline/logs 15 | **/.vscode 16 | certificates 17 | **/tmp 18 | # Vim swap files 19 | .*.sw? 20 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Extract Graph-based insights from Farcaster 2 | The project is broken into three sub-projects: 3 | 4 | 1. `notebooks` - Jupyter notebooks for data exploration and prototyping graph queries. 5 | 2. `pipeline` - python scripts to generate graphs and dataframes that can be used to serve graph-based queries. 6 | 3. `serve` - FastAPI server to serve API requests for querying the graph from Farcaster. 7 | 8 | __NOTE__ For details on how to deploy an individual sub-project, check out the Readme docs under that sub-project. 9 | 10 | -------------------------------------------------------------------------------- /notebooks/README.md: -------------------------------------------------------------------------------- 1 | # Pre-requisites 2 | Assuming that you have Python and [pip](https://pip.pypa.io/en/stable/) installed on your system (maybe in a [virtualenv](https://docs.python.org/3/library/venv.html)), you need to `pip install -r requirements.txt` 3 | 4 | # Exploring the Notebooks 5 | Run `jupyter notebook` and explore the notebooks in your default browser. 
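For example, a minimal setup from the repository root might look like the following. This is only a sketch: it assumes Python 3 with the `venv` module available and a POSIX shell, and the `.venv` path is just an illustration.

```bash
# create and activate an isolated environment (path is illustrative)
python3 -m venv .venv
source .venv/bin/activate

# install the notebook dependencies
pip install -r notebooks/requirements.txt

# launch Jupyter from the notebooks directory and open any notebook listed above
cd notebooks
jupyter notebook
```

If Jupyter does not open a browser automatically, copy the URL it prints into your browser.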
-------------------------------------------------------------------------------- /notebooks/requirements.txt: -------------------------------------------------------------------------------- 1 | jupyter 2 | pandas 3 | igraph 4 | niquests 5 | ipython 6 | -------------------------------------------------------------------------------- /pipeline/.env.sample: -------------------------------------------------------------------------------- 1 | DB_HOST="ip.address.or.host" 2 | DB_PORT=5432 3 | DB_USER="usually_postgres" 4 | DB_NAME="db_name_like_lens_bigquery" 5 | DB_PASSWORD="db_password" 6 | 7 | REMOTE_DB_HOST="ip.address.or.host" 8 | REMOTE_DB_PORT=9541 9 | 10 | TBL_CHANNEL_FIDS='DANGER_deletemefordefault_or_changeme' 11 | 12 | PERSONAL_IGRAPH_INPUT='PATH_TO_IG_PKL' 13 | PERSONAL_IGRAPH_URL='CHANGE_THIS_URL' 14 | 15 | IS_TEST='false' 16 | 17 | AIRFLOW_UID=0 18 | AIRFLOW_GID=0 19 | AIRFLOW__CORE__FERNET_KEY='changeme' 20 | 21 | SSH_KEY_PATH="changeme" 22 | DUNE_API_KEY="changeme" 23 | 24 | # Safe Defaults 25 | POSTGRES_TIMEOUT_SECS=60 26 | 27 | GO_EIGENTRUST_URL='http://localhost:8080' 28 | GO_EIGENTRUST_TIMEOUT_MS=600000 29 | GO_EIGENTRUST_BIND_SRC='/tmp' 30 | GO_EIGENTRUST_BIND_TARGET='/tmp' 31 | GO_EIGENTRUST_FILE_MODE='false' 32 | EIGENTRUST_ALPHA=0.5 33 | EIGENTRUST_EPSILON=1.0 34 | EIGENTRUST_MAX_ITER=50 35 | EIGENTRUST_FLAT_TAIL=2 36 | 37 | FRAMES_NAP_SECS=10 38 | FRAMES_SLEEP_SECS=300 39 | FRAMES_BATCH_SIZE=1000 40 | FRAMES_SCRAPE_CONCURRENCY=10 41 | FRAMES_SCRAPE_CONNECT_TIMEOUT_SECS=5 42 | FRAMES_SCRAPE_READ_TIMEOUT_SECS=10 43 | 44 | CASTS_SLEEP_SECS=10 45 | CASTS_BATCH_LIMIT=100000 46 | 47 | WARPCAST_CHANNELS_TIMEOUT_SECS=5 48 | CHANNEL_SLEEP_SECS=1 49 | 50 | 51 | LOG_LEVEL='INFO' 52 | LOG_FORMAT='[%(asctime)s - %(levelname)s - %(filename)s:%(lineno)s - %(funcName)s ] %(message)s' 53 | LOGURU_FORMAT='{time:YYYY-MM-DD HH:mm:ss} | {module}:{file}:{function}:{line} | {level} | {message}' 54 | LOG_PATH='/tmp/' 55 | -------------------------------------------------------------------------------- /pipeline/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM apache/airflow:latest 2 | # Switch to root to install additional packages 3 | USER root 4 | 5 | # Fix potential permission issues and update package list 6 | RUN chmod -R a+rX /var/lib/apt/lists /var/cache/apt/archives && \ 7 | apt-get clean && \ 8 | rm -rf /var/lib/apt/lists/* && \ 9 | mkdir -p /var/lib/apt/lists/partial && \ 10 | apt-get update && \ 11 | apt-get -y install zip 12 | 13 | RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" 14 | RUN unzip awscliv2.zip 15 | 16 | RUN ./aws/install 17 | 18 | # Switch back to the airflow user 19 | USER airflow 20 | 21 | # Set working directory 22 | WORKDIR /pipeline 23 | 24 | # Copy only the necessary files for initial setup 25 | COPY requirements.txt /pipeline/requirements.txt 26 | COPY .env /pipeline/.env 27 | 28 | # Source environment variables 29 | RUN /bin/bash -c "source /pipeline/.env" 30 | 31 | RUN pip install --upgrade pip 32 | 33 | RUN pip install -r /pipeline/requirements.txt 34 | RUN pip install apache-airflow-providers-pagerduty==3.7.2 discord-webhook==1.3.1 apache-airflow-providers-ssh==3.11.2 35 | 36 | 37 | -------------------------------------------------------------------------------- /pipeline/README.md: -------------------------------------------------------------------------------- 1 | # Pre-requisites 2 | 1. 
Install [psql](https://www.timescale.com/blog/how-to-install-psql-on-mac-ubuntu-debian-windows/) on your local machine. 3 | 2. Run an instance of Postgres DB with data from Farcaster (installed locally or on a remote server) 4 | 3. Install [Python 3.12](https://www.python.org/downloads/) 5 | 4. Create a Python [virtualenv](https://docs.python.org/3/library/venv.html) somewhere on your machine - for example,`python3 -m venv .venv` will create a virtualenv in your current directory. 6 | 5. Copy/rename the `.env.sample` file into `.env` and update the details of the Postgres DB from step 2 and the virtualenv from step 4. 7 | 6. If creating `.venv` fails, remove any partial environment with `rm -rf .venv`, install the venv module with `sudo apt install python3.12-venv`, and retry step 4. 8 | 9 | # Run the pipeline 10 | `sh run_pipeline.sh -w . -o /tmp/fc_graph` 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /pipeline/casts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/pipeline/casts/__init__.py -------------------------------------------------------------------------------- /pipeline/casts/main_fetch_top_casters.py: -------------------------------------------------------------------------------- 1 | # standard dependencies 2 | import sys 3 | from datetime import date 4 | 5 | # local dependencies 6 | from config import settings 7 | import utils 8 | from . import cast_db_utils 9 | 10 | # 3rd party dependencies 11 | from dotenv import load_dotenv 12 | from loguru import logger 13 | from sqlalchemy import create_engine 14 | 15 | logger.remove() 16 | level_per_module = { 17 | "": settings.LOG_LEVEL, 18 | "silentlib": False 19 | } 20 | logger.add(sys.stdout, 21 | colorize=True, 22 | format=settings.LOGURU_FORMAT, 23 | filter=level_per_module, 24 | level=0) 25 | 26 | def main(): 27 | pg_dsn = settings.ALT_POSTGRES_DSN.get_secret_value() 28 | df = cast_db_utils.fetch_top_casters_df(logger, pg_dsn) 29 | # top_casters = [] 30 | # for caster in casters: 31 | # top_casters.append({'i': caster['i'], 'v': caster['v']}) 32 | 33 | # df = pd.DataFrame(data=top_casters) 34 | df["date_iso"] = date.today() 35 | logger.info(utils.df_info_to_string(df, with_sample=True)) 36 | 37 | postgres_engine = create_engine( 38 | settings.ALT_POSTGRES_URL.get_secret_value(), 39 | connect_args={"connect_timeout": settings.POSTGRES_TIMEOUT_SECS * 1_000}, 40 | ) 41 | logger.info(postgres_engine) 42 | with postgres_engine.connect() as connection: 43 | df.to_sql('k3l_top_casters', con=connection, if_exists='append', index=False) 44 | 45 | # cast_db_utils.insert_dune_table(settings.DUNE_API_KEY, 'openrank', 'top_caster', df) 46 | 47 | logger.info('top casters data updated to DB') 48 | 49 | # end while loop 50 | 51 | 52 | if __name__ == "__main__": 53 | load_dotenv() 54 | print(settings) 55 | 56 | # parser = argparse.ArgumentParser(description='Fetch top casters, persist the dataframe to db') 57 | # 58 | # parser.add_argument('-u', '--user') 59 | # parser.add_argument('-p', '--password') 60 | # parser.add_argument('-e', '--endpoint') 61 | # 62 | # args = parser.parse_args() 63 | 64 | logger.info('hello hello') 65 | main() 66 | -------------------------------------------------------------------------------- /pipeline/channels/Bot_Fids.csv: -------------------------------------------------------------------------------- 1 | FID,Username 2 | 262301,roundsbot 3 | 862591,cura-bot 4 | 
864314,curabot 5 | 396644,hyperbot 6 | 861203,modbot 7 | 368422,automod 8 | 364927,paybot -------------------------------------------------------------------------------- /pipeline/channels/Seed_Fids.csv: -------------------------------------------------------------------------------- 1 | channel id,Seed Peers FIDs 2 | superrare,"9480,9480, 190045, 12299, 346769, 374498, 513681, 270678, 368422,12299, 190045, 270678, 346769, 374498, 513681, 9480" 3 | build,"8446, 195255, 221216, 6730, 9856, 4461, 1214, 9816, 15732, 399485, 16085, 14351, 99" 4 | memes,"576, 3, 2, 3621, 239, 457, 347, 557, 4407, 1287, 1325" 5 | dev,"191, 6841" 6 | louder,"238853,15696, 206, 403020, 395131, 508334, 477292" 7 | wildcardclub,"4914, 7791" 8 | -------------------------------------------------------------------------------- /pipeline/channels/Trending_Channels.csv: -------------------------------------------------------------------------------- 1 | ChannelID 2 | zora 3 | farcaster 4 | itookaphoto 5 | memes 6 | replyguys 7 | farville 8 | degen 9 | nature 10 | sense 11 | food 12 | jobs 13 | lifeisgood 14 | anime-manga 15 | football 16 | higher 17 | dickbutt 18 | art 19 | talent 20 | brypto 21 | dickbutt 22 | six 23 | vibely 24 | screens 25 | nba -------------------------------------------------------------------------------- /pipeline/channels/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/pipeline/channels/__init__.py -------------------------------------------------------------------------------- /pipeline/channels/main_metrics.py: -------------------------------------------------------------------------------- 1 | # standard dependencies 2 | import sys 3 | import argparse 4 | import datetime 5 | 6 | # local dependencies 7 | from config import settings 8 | from . 
import channel_db_utils 9 | from .channel_db_utils import Metric 10 | 11 | # 3rd party dependencies 12 | from dotenv import load_dotenv 13 | from loguru import logger 14 | 15 | # Configure logger 16 | logger.remove() 17 | level_per_module = { 18 | "": settings.LOG_LEVEL, 19 | "silentlib": False 20 | } 21 | logger.add(sys.stdout, 22 | colorize=True, 23 | format=settings.LOGURU_FORMAT, 24 | filter=level_per_module, 25 | level=0) 26 | 27 | load_dotenv() 28 | 29 | def main(): 30 | # Metrics only available in Eigen 8 31 | pg_dsn = settings.ALT_POSTGRES_DSN.get_secret_value() 32 | sql_timeout_ms = 120_000 33 | channel_db_utils.upsert_weekly_metrics(logger, pg_dsn, sql_timeout_ms, Metric.WEEKLY_NUM_CASTS) 34 | channel_db_utils.upsert_weekly_metrics(logger, pg_dsn, sql_timeout_ms, Metric.WEEKLY_UNIQUE_CASTERS) 35 | 36 | if __name__ == "__main__": 37 | 38 | parser = argparse.ArgumentParser() 39 | parser.add_argument( 40 | "--run", 41 | action="store_true", 42 | help="dummy arg to prevent accidental execution", 43 | required=True 44 | ) 45 | parser.add_argument( 46 | "--dry-run", 47 | help="indicate dry-run mode", 48 | action="store_true" 49 | ) 50 | args = parser.parse_args() 51 | print(args) 52 | logger.info(settings) 53 | 54 | if args.dry_run: 55 | settings.IS_TEST = True 56 | 57 | main() -------------------------------------------------------------------------------- /pipeline/channels/openrank_utils.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import subprocess 3 | import os 4 | import tempfile 5 | 6 | from config import settings 7 | 8 | from loguru import logger 9 | 10 | def download_results(req_id: str, toml_file: Path, out_dir:Path, out_file: Path): 11 | new_env = os.environ.copy() 12 | new_env['SECRET_KEY'] = settings.OPENRANK_REQ_SECRET_KEY.get_secret_value() 13 | get_cmd = subprocess.run( 14 | ["openrank-sdk", "get-results", str(req_id), str(toml_file), str(out_file)], 15 | stdout=subprocess.DEVNULL, 16 | stderr=subprocess.PIPE, 17 | text=True, 18 | timeout=settings.OPENRANK_TIMEOUT_SECS, 19 | env=new_env, 20 | check=True, 21 | ) 22 | if get_cmd.returncode != 0: 23 | logger.error(f"OpenRank get-results failed for {req_id}: {get_cmd.stderr}") 24 | raise Exception("OpenRank get-results failed") 25 | logger.info(f"OpenRank get-results for {req_id} downloaded to: {out_file}") 26 | 27 | def update_and_compute(lt_file: Path, pt_file: Path, toml_file: Path) -> str: 28 | new_env = os.environ.copy() 29 | new_env['SECRET_KEY'] = settings.OPENRANK_REQ_SECRET_KEY.get_secret_value() 30 | 31 | lt_cmd = subprocess.run( 32 | ["openrank-sdk", "trust-update", str(lt_file), str(toml_file)], 33 | stdout=subprocess.PIPE, 34 | stderr=subprocess.STDOUT, 35 | text=True, 36 | # check=True, # we don't want to throw error until we have a chance to print the output 37 | timeout=settings.OPENRANK_TIMEOUT_SECS, 38 | env=new_env, 39 | ) 40 | logger.info(f"OpenRank trust-update output: {lt_cmd}") 41 | if lt_cmd.returncode != 0: 42 | logger.error(f"OpenRank trust-update failed: {lt_cmd.stdout}") 43 | raise Exception("OpenRank trust-update failed") 44 | pt_cmd = subprocess.run( 45 | ["openrank-sdk", "seed-update", str(pt_file), str(toml_file)], 46 | stdout=subprocess.PIPE, 47 | stderr=subprocess.STDOUT, 48 | text=True, 49 | timeout=settings.OPENRANK_TIMEOUT_SECS, 50 | env=new_env, 51 | ) 52 | logger.info(f"OpenRank seed-update output: {pt_cmd}") 53 | if pt_cmd.returncode != 0: 54 | logger.error(f"OpenRank seed-update failed: {pt_cmd.stdout}") 55 | 
raise Exception("OpenRank seed-update failed") 56 | compute_cmd = subprocess.run( 57 | ["openrank-sdk", "compute-request", str(toml_file)], 58 | stdout=subprocess.PIPE, 59 | stderr=subprocess.STDOUT, 60 | text=True, 61 | timeout=settings.OPENRANK_TIMEOUT_SECS, 62 | env=new_env, 63 | ) 64 | logger.info(f"OpenRank compute output: {compute_cmd}") 65 | if compute_cmd.returncode != 0: 66 | logger.error(f"OpenRank compute failed: {compute_cmd.stdout}") 67 | raise Exception("OpenRank compute failed") 68 | req_id = compute_cmd.stdout.strip() 69 | logger.info(f"OpenRank request id: {req_id}") 70 | return req_id 71 | -------------------------------------------------------------------------------- /pipeline/dag_utils/clear_task_instance.py: -------------------------------------------------------------------------------- 1 | from airflow import settings 2 | from airflow.models import DagRun, TaskInstance 3 | from airflow.utils.state import State 4 | 5 | # Define your variables 6 | dag_id = "gen_personal_graph_replica_v1" 7 | task_id = "process_channel_chunk" 8 | run_id = "manual__2024-07-22T06:46:15.813325+00:00" 9 | map_index_start = 908 # 908 430 10 | map_index_end = 939 # 939 907 11 | 12 | # Get the session 13 | session = settings.Session() 14 | 15 | # Query the DagRun 16 | dag_run = session.query(DagRun).filter(DagRun.dag_id == dag_id, DagRun.run_id == run_id).one() 17 | 18 | # Loop through the range of map indexes and clear each task instance 19 | for map_index in range(map_index_start, map_index_end + 1): 20 | try: 21 | # Query the TaskInstance 22 | task_instance = session.query(TaskInstance).filter( 23 | TaskInstance.dag_id == dag_id, 24 | TaskInstance.task_id == task_id, 25 | TaskInstance.run_id == run_id, 26 | TaskInstance.map_index == map_index 27 | ).one() 28 | 29 | # Clear the task instance 30 | task_instance.set_state(State.SUCCESS, session=session) 31 | print(f"Cleared task {task_id} with map index {map_index} for DAG {dag_id} and run ID {run_id}") 32 | except Exception as e: 33 | print(f"Could not clear task {task_id} with map index {map_index}: {e}") 34 | 35 | # Commit the changes 36 | session.commit() 37 | print(f"Cleared tasks {task_id} with map indexes from {map_index_start} to {map_index_end} for DAG {dag_id} and run ID {run_id}") 38 | 39 | -------------------------------------------------------------------------------- /pipeline/dag_utils/combine_csv.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | import re 4 | 5 | # Specify the directory containing the CSV files 6 | directory = 'backup/' 7 | 8 | # Specify the output file 9 | output_file = 'combined_dataset.csv' 10 | 11 | # Function to extract numeric offset from filename 12 | def extract_offset(filename): 13 | match = re.search(r'offset_(\d+)', filename) 14 | return int(match.group(1)) if match else 0 15 | 16 | # Get list of files sorted by numeric offset 17 | files = sorted( 18 | (f for f in os.listdir(directory) if f.startswith('karma3-labs.dataset_k3l_cast_localtrust_offset_') and f.endswith('.csv')), 19 | key=extract_offset 20 | ) 21 | 22 | # Initialize a flag to handle headers 23 | header_saved = False 24 | 25 | # Open the output file in write mode 26 | with open(output_file, 'w', newline='') as outfile: 27 | csv_writer = csv.writer(outfile) 28 | 29 | # Iterate over each sorted file 30 | for filename in files: 31 | file_path = os.path.join(directory, filename) 32 | 33 | # Open each CSV file in read mode 34 | with open(file_path, 'r') as infile: 35 | 
csv_reader = csv.reader(infile) 36 | 37 | # Iterate over the rows in the input file 38 | for i, row in enumerate(csv_reader): 39 | # Write the header only once 40 | if i == 0: 41 | if not header_saved: 42 | csv_writer.writerow(row) 43 | header_saved = True 44 | else: 45 | # Skip empty rows 46 | if any(cell.strip() for cell in row): 47 | csv_writer.writerow(row) 48 | 49 | print(f'Combined CSV file saved as {output_file}') -------------------------------------------------------------------------------- /pipeline/dag_utils/dune_backup.py: -------------------------------------------------------------------------------- 1 | import urllib3 2 | from concurrent.futures import ThreadPoolExecutor, as_completed 3 | 4 | import time 5 | import random 6 | 7 | http = urllib3.PoolManager() 8 | 9 | def download_csv(limit: int, offset: int, table_name: str): 10 | """ 11 | Download one CSV chunk of Dune query results and write it to a local backup file. 12 | 13 | Args: 14 | limit (int): Maximum number of rows to fetch. offset (int): Row offset into the query results. table_name (str): Prefix for the output filename. 15 | 16 | Returns: 17 | None: the chunk is written to backup/{table_name}_offset_{offset}_limit_{limit}.csv. 18 | 19 | Example: 20 | download_csv(30000, 0, 'karma3-labs.dataset_k3l_cast_localtrust') 21 | """ 22 | print(f'limit={limit}, offset={offset}') 23 | jitter = random.uniform(0.01, 1) 24 | time.sleep(jitter) 25 | 26 | response = http.request( 27 | 'GET', 28 | f'https://api.dune.com/api/v1/query/3832819/results/csv?limit={limit}&offset={offset}', 29 | headers={ 30 | 'Accept': 'text/csv', 31 | 'Content-Type':'text/csv', 32 | 'X-DUNE-API-KEY': '7QYqrqNvGVJJuwMybzxfh1sbR8qXFbDI', 33 | }, 34 | preload_content=False 35 | ) 36 | if response.status != 200: 37 | raise Exception(f"Failed to download CSV: {response.data.decode('utf-8')}") 38 | 39 | # data = response.data.decode('utf-8') 40 | # print(data) 41 | filename = f'backup/{table_name}_offset_{offset}_limit_{limit}.csv' 42 | with open(filename, 'wb') as out_file: 43 | # print(data) 44 | # data = response.read() # a `bytes` object 45 | out_file.write(response.data) 46 | 47 | # shutil.copyfileobj(response, out_file) 48 | # out_file.write(response) 49 | print(f'wrote {filename}') 50 | 51 | 52 | 53 | limit = 30000 54 | # next = limit 55 | offset = 0 56 | 57 | start = 0 58 | stop = 382500000 59 | step = limit 60 | incremental_array = list(range(start, stop + step, step)) 61 | 62 | # print(incremental_array[:100]) 63 | num_workers = 25 64 | table_name = "karma3-labs.dataset_k3l_cast_localtrust" 65 | # Use ThreadPoolExecutor to make parallel HTTP requests 66 | with ThreadPoolExecutor(max_workers=num_workers) as executor: 67 | future_to_value = {executor.submit(download_csv, limit, value, table_name): value for value in incremental_array} 68 | 69 | for future in as_completed(future_to_value): 70 | value = future_to_value[future] 71 | try: 72 | future.result() 73 | except Exception as exc: 74 | print(f'Value {value} generated an exception: {exc}') 75 | 76 | print("All requests completed.") -------------------------------------------------------------------------------- /pipeline/dags/archived/dag_automod.py: -------------------------------------------------------------------------------- 1 | from airflow import DAG 2 | from airflow.operators.bash_operator import BashOperator 3 | from airflow.models import Variable 4 | from hooks.discord import send_alert_discord 5 | from hooks.pagerduty import send_alert_pagerduty 6 | from datetime import datetime, timedelta 7 | 8 | 9 | api_key = Variable.get("API_KEY", default_var="api_key") 10 | db_endpoint = Variable.get('DB_ENDPOINT', default_var="test") 11 | db_user = Variable.get('DB_USER', default_var="test") 12 | 
db_password = Variable.get('DB_PASSWORD', default_var="test") 13 | 14 | 15 | default_args = { 16 | 'owner': 'coder2j', 17 | 'retries': 1, 18 | 'retry_delay': timedelta(minutes=5) 19 | } 20 | 21 | with DAG( 22 | 'extract_automod_api_to_db', 23 | default_args=default_args, 24 | description='Fetch data from AUTOMOD API and load into DB daily', 25 | # schedule_interval=timedelta(days=1), 26 | schedule_interval=None, 27 | start_date=datetime(2024, 9, 4), 28 | is_paused_upon_creation=True, 29 | max_active_runs=1, 30 | catchup=False, 31 | ) as dag: 32 | fetch_data_from_automod = BashOperator( 33 | task_id='fetch_automod_data_from_api', 34 | bash_command=f"cd /pipeline/extractors ; python3 automod_extractor.py {api_key} { db_user } { db_password } { db_endpoint }" 35 | ) 36 | 37 | fetch_data_from_automod 38 | -------------------------------------------------------------------------------- /pipeline/dags/archived/dag_backup_sandbox_db.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.models import Variable 5 | from airflow.contrib.operators.ssh_operator import SSHOperator 6 | from airflow.contrib.hooks.ssh_hook import SSHHook 7 | from airflow.operators.bash import BashOperator 8 | 9 | from hooks.discord import send_alert_discord 10 | from hooks.pagerduty import send_alert_pagerduty 11 | 12 | default_args = { 13 | 'owner': 'coder2j', 14 | 'retries': 5, 15 | 'retry_delay': timedelta(minutes=2), 16 | 'on_failure_callback': [send_alert_discord, send_alert_pagerduty], 17 | } 18 | 19 | sandbox_db_sync_path = Variable.get("sandbox_db_sync_path") 20 | dev_sandbox_db_sync_path = Variable.get("dev_sandbox_db_sync_path") 21 | 22 | with DAG( 23 | dag_id='dag_backup_sandbox_db_v0', 24 | default_args=default_args, 25 | description='sync the db table of the sandboxed read replica', 26 | start_date=datetime(2024, 8, 10, 18), 27 | # schedule_interval='0 0 * * *', # backup everyday 28 | schedule_interval=None, # backup everyday 29 | catchup=False, 30 | ) as dag: 31 | 32 | 33 | # ssh_hook = SSHHook(ssh_conn_id='eigen2', keepalive_interval=60, cmd_timeout=None) 34 | 35 | # run_sandbox_backup = SSHOperator( 36 | # task_id="run_sandbox_backup_v0", 37 | # command=f"cd {sandbox_db_sync_path}; ./run-backup.sh ", 38 | # ssh_hook=ssh_hook, 39 | # dag=dag) 40 | 41 | run_sandbox_backup = BashOperator( 42 | task_id='run_sandbox_backup', 43 | bash_command="cd /pipeline && ./run_sandbox_backup.sh " 44 | ) 45 | 46 | run_sandbox_backup 47 | -------------------------------------------------------------------------------- /pipeline/dags/archived/dag_copy_graph_files_to_sandbox_dev_v1.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.models import Variable 5 | from airflow.operators.bash import BashOperator 6 | from airflow.providers.ssh.operators.ssh import SSHHook 7 | from airflow.providers.ssh.operators.ssh import SSHOperator 8 | from airflow.sensors.external_task import ExternalTaskSensor 9 | 10 | from hooks.discord import send_alert_discord 11 | from hooks.pagerduty import send_alert_pagerduty 12 | 13 | default_args = { 14 | "owner": "coder2j", 15 | "retries": 5, 16 | "retry_delay": timedelta(minutes=2), 17 | "on_failure_callback": [send_alert_discord, send_alert_pagerduty], 18 | } 19 | 20 | dev_sandbox_pipeline_path = Variable.get("dev_sandbox_pipeline_path") 21 | 
data_backup_s3_bucket = Variable.get("data_backup_s3_bucket") 22 | 23 | with DAG( 24 | dag_id="copy_graph_files_to_sandbox_dev_v2", 25 | default_args=default_args, 26 | description="re-generate graph for farcaster-graph API server. copy re-generated all graph files to dev sandbox from backup s3", 27 | start_date=datetime(2024, 7, 9, 18), 28 | # schedule_interval="0 0 * * *", 29 | schedule_interval=None, 30 | is_paused_upon_creation=True, 31 | max_active_runs=1, 32 | catchup=False, 33 | ) as dag: 34 | 35 | ssh_hook = SSHHook(ssh_conn_id='sandbox_staging', keepalive_interval=60, cmd_timeout=None) 36 | 37 | download_pqt_file = SSHOperator( 38 | task_id="download_pqt_file_v1", 39 | command=f"cd {dev_sandbox_pipeline_path}; ./run_graph_pipeline.sh -o /data/serve_files -s {data_backup_s3_bucket} ", 40 | ssh_hook=ssh_hook, 41 | dag=dag, 42 | ) 43 | 44 | download_pqt_file -------------------------------------------------------------------------------- /pipeline/dags/archived/dag_degen_tips_processing.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | from airflow import DAG 3 | from airflow.operators.bash import BashOperator 4 | from airflow.operators.python import PythonOperator 5 | from hooks.discord import send_alert_discord 6 | from hooks.pagerduty import send_alert_pagerduty 7 | 8 | default_args = { 9 | 'owner': 'coder2j', 10 | 'retries': 5, 11 | 'retry_delay': timedelta(minutes=2), 12 | # 'on_failure_callback': [send_alert_discord, send_alert_pagerduty], 13 | } 14 | 15 | with DAG( 16 | dag_id='dag_degen_tips_processing_v0', 17 | default_args=default_args, 18 | description='Process DEGEN tips from casts', 19 | start_date=datetime(2024, 7, 9, 18), 20 | # schedule_interval='*/10 * * * *', # Run every 10 minutes 21 | schedule_interval=None, 22 | catchup=False, 23 | ) as dag: 24 | task_update_degen_tips = BashOperator( 25 | task_id='update_degen_tips_v0', 26 | bash_command='''cd /pipeline/ && ./run_create_degen_db_functions.sh -v .venv -t extract 27 | ''' 28 | ) 29 | 30 | task_analyze_degen_tips = BashOperator( 31 | task_id='analyze_degen_tips_v0', 32 | bash_command='''cd /pipeline/ && ./run_eigen2_postgres_sql.sh -w . " 33 | ANALYZE k3l_degen_tips; 34 | ANALYZE k3l_cast_action;" 35 | ''' 36 | ) 37 | 38 | # Set up the task dependencies 39 | task_update_degen_tips >> task_analyze_degen_tips -------------------------------------------------------------------------------- /pipeline/dags/archived/dag_gen_personal_graph_replica_v0.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.models import Variable 5 | from airflow.contrib.operators.ssh_operator import SSHOperator 6 | from airflow.contrib.hooks.ssh_hook import SSHHook 7 | 8 | from hooks.discord import send_alert_discord 9 | from hooks.pagerduty import send_alert_pagerduty 10 | 11 | default_args = { 12 | 'owner': 'coder2j', 13 | 'retries': 5, 14 | 'retry_delay': timedelta(minutes=2), 15 | 'on_failure_callback': [send_alert_discord, send_alert_pagerduty], 16 | } 17 | 18 | with DAG( 19 | dag_id='gen_personal_graph_replica_v0', 20 | default_args=default_args, 21 | description='Every hour, try running personal graph script on eigen7 replica. 
Script has internal check for 36 hours', 22 | start_date=datetime(2024, 7, 9, 18), 23 | # schedule_interval='0 * * * *', 24 | schedule_interval=None, 25 | catchup=False, 26 | ) as dag: 27 | ssh_hook = SSHHook(ssh_conn_id='eigen7', keepalive_interval=60, cmd_timeout=None) 28 | 29 | eigen7_copy_localtrust_csv_files = SSHOperator( 30 | task_id="eigen7_gen_personal_graph", 31 | command=f"cd ~/farcaster-graph/pipeline; ./run_personal_graph_pipeline.sh -i ~/serve_files/lt_l1rep6rec3m12enhancedConnections_fid.csv -o ~/wip_files/ -w . -v .venv -s k3l-openrank-farcaster -l /var/log/farcaster-graph/ ", 32 | ssh_hook=ssh_hook, 33 | dag=dag) 34 | 35 | eigen7_copy_localtrust_csv_files 36 | -------------------------------------------------------------------------------- /pipeline/dags/archived/dag_insert_degen_ranking_v0.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | from airflow import DAG 3 | from airflow.operators.bash import BashOperator 4 | from airflow.operators.python import PythonOperator 5 | from hooks.discord import send_alert_discord 6 | from hooks.pagerduty import send_alert_pagerduty 7 | 8 | default_args = { 9 | 'owner': 'coder2j', 10 | 'retries': 5, 11 | 'retry_delay': timedelta(minutes=2), 12 | # 'on_failure_callback': [send_alert_discord, send_alert_pagerduty], 13 | } 14 | 15 | with DAG( 16 | dag_id='dag_degen_insert_ranking_v0', 17 | default_args=default_args, 18 | description='Process DEGEN tips from casts', 19 | start_date=datetime(2024, 7, 9, 18), 20 | # schedule_interval='10 */6 * * *', 21 | schedule_interval=None, 22 | catchup=False, 23 | ) as dag: 24 | 25 | task_update_degen_tips = BashOperator( 26 | task_id='update_degen_tips_v0', 27 | bash_command='''cd /pipeline/ && ./run_create_degen_db_functions.sh -v .venv -t insert_scores 28 | ''' 29 | ) 30 | 31 | task_analyze_degen_tips = BashOperator( 32 | task_id='analyze_degen_tips_v0', 33 | bash_command='''cd /pipeline/ && ./run_eigen2_postgres_sql.sh -w . " 34 | ANALYZE k3l_degen_tips; 35 | ANALYZE k3l_cast_action;" 36 | ''' 37 | ) 38 | 39 | # Set up the task dependencies 40 | task_update_degen_tips >> task_analyze_degen_tips -------------------------------------------------------------------------------- /pipeline/dags/archived/dag_run_frame_pipeline_v0.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.operators.bash import BashOperator 5 | 6 | from hooks.discord import send_alert_discord 7 | from hooks.pagerduty import send_alert_pagerduty 8 | 9 | default_args = { 10 | 'owner': 'coder2j', 11 | 'retries': 5, 12 | 'retry_delay': timedelta(minutes=2), 13 | 'on_failure_callback': [send_alert_discord, send_alert_pagerduty], 14 | } 15 | 16 | with DAG( 17 | dag_id='extract_frame_url_v0', 18 | default_args=default_args, 19 | description='Extract urls from cast embeds for frames and refresh pg statistics', 20 | start_date=datetime(2024, 7, 9, 18), 21 | # schedule_interval='1-59/20 * * * *', 22 | # Decommission Frames ranking due to lack of usage 23 | # ... and relevance with the introduction of Frames V2 by Warpcast 24 | # schedule_interval=timedelta(minutes=20), 25 | schedule_interval=None, 26 | is_paused_upon_creation=True, 27 | max_active_runs=1, 28 | catchup=False, 29 | ) as dag: 30 | task1 = BashOperator( 31 | task_id='run_urlextract_pipeline', 32 | bash_command='cd /pipeline/ && ./run_urlextract_pipeline.sh -w . 
' 33 | ) 34 | 35 | task2 = BashOperator( 36 | task_id='run_frame_scraper', 37 | bash_command='cd /pipeline/ && ./run_frame_scraper.sh -v ./.venv/ ' 38 | ) 39 | 40 | task3 = BashOperator( 41 | task_id='analyze_url_labels_and_mapping', 42 | bash_command='''cd /pipeline/ && ./run_eigen2_postgres_sql.sh -w . " 43 | ANALYZE k3l_url_labels; ANALYZE k3l_cast_embed_url_mapping;" 44 | ''' 45 | ) 46 | 47 | task4 = BashOperator( 48 | task_id='refresh_k3l_frame_interaction', 49 | bash_command='''cd /pipeline/ && ./run_eigen2_postgres_sql.sh -w . " 50 | REFRESH MATERIALIZED VIEW CONCURRENTLY k3l_recent_frame_interaction;" 51 | ''' 52 | ) 53 | 54 | # task5 = BashOperator( 55 | # task_id='vacuum_k3l_frame_interaction', 56 | # bash_command='''cd /pipeline/ && ./run_eigen2_postgres_sql.sh -w . " 57 | # VACUUM ANALYZE k3l_recent_frame_interaction;" 58 | # ''' 59 | # ) 60 | 61 | # task1 >> task2 >> task3 >> task4 >> task5 62 | task1 >> task2 >> task3 >> task4 63 | 64 | -------------------------------------------------------------------------------- /pipeline/dags/archived/extractors/dag_warpcast_channel_followers.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.operators.bash import BashOperator 5 | 6 | from hooks.discord import send_alert_discord 7 | from hooks.pagerduty import send_alert_pagerduty 8 | 9 | default_args = { 10 | "owner": "karma3labs", 11 | "retries": 1, 12 | "retry_delay": timedelta(minutes=5), 13 | 'on_failure_callback': [send_alert_discord, send_alert_pagerduty], 14 | } 15 | 16 | with DAG( 17 | "extract_warpcast_followers", 18 | default_args=default_args, 19 | description="Fetch channel followers from WARPCAST API and load into DB daily", 20 | schedule_interval=timedelta(days=1), 21 | start_date=datetime(2024, 8, 1), 22 | is_paused_upon_creation=True, 23 | max_active_runs=1, 24 | catchup=False, 25 | ) as dag: 26 | 27 | prep_task = BashOperator( 28 | task_id='prep_warpcast_followers', 29 | bash_command="cd /pipeline; extractors/extract_channel_fids.sh -t prep" 30 | " -w . -v .venv -j followers", 31 | dag=dag 32 | ) 33 | 34 | fetch_task = BashOperator( 35 | task_id='extract_channel_followers', 36 | bash_command="cd /pipeline; extractors/extract_channel_fids.sh -t fetch" 37 | " -w . -v .venv -c channels/Top_Channels.csv -s top -j followers", 38 | dag=dag 39 | ) 40 | 41 | cleanup_task = BashOperator( 42 | task_id='cleanup_warpcast_followers', 43 | bash_command="cd /pipeline; extractors/extract_channel_fids.sh -t cleanup" 44 | " -w . 
-v .venv -j followers", 45 | dag=dag 46 | ) 47 | 48 | prep_task >> fetch_task >> cleanup_task -------------------------------------------------------------------------------- /pipeline/dags/archived/extractors/dag_warpcast_channel_members.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.operators.bash import BashOperator 5 | 6 | from hooks.discord import send_alert_discord 7 | from hooks.pagerduty import send_alert_pagerduty 8 | 9 | default_args = { 10 | "owner": "karma3labs", 11 | "retries": 1, 12 | "retry_delay": timedelta(minutes=5), 13 | 'on_failure_callback': [send_alert_discord, send_alert_pagerduty], 14 | } 15 | 16 | with DAG( 17 | "extract_warpcast_members", 18 | default_args=default_args, 19 | description="Fetch channel members from WARPCAST API and load into DB daily", 20 | schedule_interval=timedelta(hours=1), 21 | start_date=datetime(2024, 8, 1), 22 | is_paused_upon_creation=True, 23 | max_active_runs=1, 24 | catchup=False, 25 | ) as dag: 26 | 27 | prep_task = BashOperator( 28 | task_id='prep_warpcast_members', 29 | bash_command="cd /pipeline; extractors/extract_channel_fids.sh -t prep" 30 | " -w . -v .venv -j members", 31 | dag=dag 32 | ) 33 | 34 | fetch_task = BashOperator( 35 | task_id='fetch_warpcast_members', 36 | bash_command="cd /pipeline; extractors/extract_channel_fids.sh -t fetch" 37 | " -w . -v .venv -c channels/Top_Channels.csv -s top -j members", 38 | dag=dag 39 | ) 40 | 41 | cleanup_task = BashOperator( 42 | task_id='cleanup_warpcast_members', 43 | bash_command="cd /pipeline; extractors/extract_channel_fids.sh -t cleanup" 44 | " -w . -v .venv -j members", 45 | dag=dag 46 | ) 47 | 48 | prep_task >> fetch_task >> cleanup_task -------------------------------------------------------------------------------- /pipeline/dags/archived/extractors/dag_warpcast_channels.py: -------------------------------------------------------------------------------- 1 | from airflow import DAG 2 | from airflow.operators.bash_operator import BashOperator 3 | from airflow.models import Variable 4 | from hooks.discord import send_alert_discord 5 | from hooks.pagerduty import send_alert_pagerduty 6 | from datetime import datetime, timedelta 7 | 8 | db_endpoint = Variable.get('DB_ENDPOINT', default_var="test") 9 | db_user = Variable.get('DB_USER', default_var="test") 10 | db_password = Variable.get('DB_PASSWORD', default_var="test") 11 | 12 | 13 | default_args = { 14 | 'owner': 'coder2j', 15 | 'retries': 1, 16 | 'retry_delay': timedelta(minutes=5) 17 | } 18 | 19 | with DAG( 20 | 'extract_warpcast_channels', 21 | default_args=default_args, 22 | description='Fetch channels metadata from WARPCAST API and load into DB daily', 23 | schedule_interval=timedelta(days=1), 24 | start_date=datetime(2024, 8, 19), 25 | is_paused_upon_creation=True, 26 | max_active_runs=1, 27 | catchup=False, 28 | ) as dag: 29 | fetch_data_from_warpcast = BashOperator( 30 | task_id='fetch_warpcast_data_from_api', 31 | bash_command="cd /pipeline; extractors/extract_channel_data.sh" 32 | " -w . 
-v .venv ", 33 | dag=dag 34 | ) 35 | 36 | fetch_data_from_warpcast 37 | -------------------------------------------------------------------------------- /pipeline/dags/archived/sandbox/dag_sync_sandbox_casts.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.models import Variable 5 | from airflow.contrib.operators.ssh_operator import SSHOperator 6 | from airflow.contrib.hooks.ssh_hook import SSHHook 7 | 8 | from hooks.discord import send_alert_discord 9 | from hooks.pagerduty import send_alert_pagerduty 10 | 11 | default_args = { 12 | 'owner': 'coder2j', 13 | 'retries': 5, 14 | 'retry_delay': timedelta(minutes=2), 15 | 'on_failure_callback': [send_alert_discord, send_alert_pagerduty], 16 | } 17 | 18 | sandbox_db_sync_path = Variable.get("sandbox_db_sync_path") 19 | dev_sandbox_db_sync_path = Variable.get("dev_sandbox_db_sync_path") 20 | 21 | with DAG( 22 | dag_id='sync_sandbox_db_casts', 23 | default_args=default_args, 24 | description='sync cast actions and parent casts to the sandbox', 25 | start_date=datetime(2024, 7, 10, 18), 26 | # schedule_interval='*/10 * * * *', 27 | # schedule_interval=timedelta(minutes=5), 28 | schedule=None, 29 | is_paused_upon_creation=True, 30 | max_active_runs=1, 31 | catchup=False, 32 | ) as dag: 33 | ssh_hook = SSHHook(ssh_conn_id='eigen2', keepalive_interval=60, cmd_timeout=None) 34 | 35 | run_append = SSHOperator( 36 | task_id="run_append_v1", 37 | command=f"cd {sandbox_db_sync_path}; ./1-run-append_v1.sh ", 38 | ssh_hook=ssh_hook, 39 | dag=dag) 40 | 41 | run_remove = SSHOperator( 42 | task_id="run_remove_v0", 43 | command=f"cd {sandbox_db_sync_path}; ./2-run-remove.sh ", 44 | ssh_hook=ssh_hook, 45 | dag=dag) 46 | 47 | run_append >> run_remove 48 | 49 | -------------------------------------------------------------------------------- /pipeline/dags/archived/sandbox/dag_sync_sandbox_channel_fids.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.models import Variable 5 | from airflow.contrib.operators.ssh_operator import SSHOperator 6 | from airflow.contrib.hooks.ssh_hook import SSHHook 7 | 8 | from hooks.discord import send_alert_discord 9 | from hooks.pagerduty import send_alert_pagerduty 10 | 11 | default_args = { 12 | 'owner': 'coder2j', 13 | 'retries': 5, 14 | 'retry_delay': timedelta(minutes=2), 15 | 'on_failure_callback': [send_alert_discord, send_alert_pagerduty], 16 | } 17 | 18 | sandbox_db_sync_path = Variable.get("sandbox_db_sync_path") 19 | 20 | with DAG( 21 | dag_id='sync_sandbox_channel_fids', 22 | default_args=default_args, 23 | description='sync globaltrust to the sandbox', 24 | start_date=datetime(2024, 7, 10, 18), 25 | # schedule_interval='*/10 * * * *', 26 | schedule=None, 27 | is_paused_upon_creation=True, 28 | max_active_runs=1, 29 | catchup=False, 30 | ) as dag: 31 | ssh_hook = SSHHook(ssh_conn_id='eigen2', keepalive_interval=60, cmd_timeout=None) 32 | 33 | run_append = SSHOperator( 34 | task_id="run_append_v1", 35 | command=f"cd {sandbox_db_sync_path}; ./1-run-append_v1.sh -c ", 36 | ssh_hook=ssh_hook, 37 | dag=dag) 38 | 39 | run_refresh = SSHOperator( 40 | task_id="run_refresh_v0", 41 | command=f"cd {sandbox_db_sync_path}; ./4-run-refresh.sh -c ", 42 | ssh_hook=ssh_hook, 43 | dag=dag) 44 | 45 | run_append >> run_refresh 46 | 47 | 
-------------------------------------------------------------------------------- /pipeline/dags/archived/sandbox/dag_sync_sandbox_db_dev.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.models import Variable 5 | from airflow.contrib.operators.ssh_operator import SSHOperator 6 | from airflow.contrib.hooks.ssh_hook import SSHHook 7 | 8 | from hooks.discord import send_alert_discord 9 | from hooks.pagerduty import send_alert_pagerduty 10 | 11 | default_args = { 12 | 'owner': 'coder2j', 13 | 'retries': 5, 14 | 'retry_delay': timedelta(minutes=2), 15 | 'on_failure_callback': [send_alert_discord, send_alert_pagerduty], 16 | } 17 | 18 | dev_sandbox_db_sync_path = Variable.get("dev_sandbox_db_sync_path") 19 | 20 | with DAG( 21 | dag_id='dag_sync_sandbox_db_dev_v0', 22 | default_args=default_args, 23 | description='sync the db table of the sandboxed read replica', 24 | start_date=datetime(2024, 7, 10, 18), 25 | # schedule_interval='*/10 * * * *', 26 | schedule_interval=None, 27 | catchup=False, 28 | ) as dag: 29 | ssh_hook = SSHHook(ssh_conn_id='eigen2', keepalive_interval=60, cmd_timeout=None) 30 | 31 | run_append_dev = SSHOperator( 32 | task_id="run_append_dev_v0", 33 | command=f"cd {dev_sandbox_db_sync_path}; ./1-run-append.sh -d 5 ", 34 | ssh_hook=ssh_hook, 35 | dag=dag) 36 | 37 | run_remove_dev = SSHOperator( 38 | task_id="run_remove_dev_v0", 39 | command=f"cd {dev_sandbox_db_sync_path}; ./2-run-remove.sh ", 40 | ssh_hook=ssh_hook, 41 | dag=dag) 42 | 43 | run_append_dev >> run_remove_dev 44 | 45 | -------------------------------------------------------------------------------- /pipeline/dags/archived/sandbox/dag_sync_sandbox_globaltrust.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.models import Variable 5 | from airflow.contrib.operators.ssh_operator import SSHOperator 6 | from airflow.contrib.hooks.ssh_hook import SSHHook 7 | 8 | from hooks.discord import send_alert_discord 9 | from hooks.pagerduty import send_alert_pagerduty 10 | 11 | default_args = { 12 | 'owner': 'coder2j', 13 | 'retries': 5, 14 | 'retry_delay': timedelta(minutes=2), 15 | 'on_failure_callback': [send_alert_discord, send_alert_pagerduty], 16 | } 17 | 18 | sandbox_db_sync_path = Variable.get("sandbox_db_sync_path") 19 | 20 | with DAG( 21 | dag_id='sync_sandbox_globaltrust', 22 | default_args=default_args, 23 | description='sync globaltrust to the sandbox', 24 | start_date=datetime(2024, 7, 10, 18), 25 | # schedule_interval='*/10 * * * *', 26 | schedule=None, 27 | is_paused_upon_creation=True, 28 | max_active_runs=1, 29 | catchup=False, 30 | ) as dag: 31 | ssh_hook = SSHHook(ssh_conn_id='eigen2', keepalive_interval=60, cmd_timeout=None) 32 | 33 | run_append = SSHOperator( 34 | task_id="run_append_v1", 35 | command=f"cd {sandbox_db_sync_path}; ./1-run-append_v1.sh -g ", 36 | ssh_hook=ssh_hook, 37 | dag=dag) 38 | 39 | run_refresh = SSHOperator( 40 | task_id="run_refresh_v0", 41 | command=f"cd {sandbox_db_sync_path}; ./4-run-refresh.sh -g ", 42 | ssh_hook=ssh_hook, 43 | dag=dag) 44 | 45 | run_append >> run_refresh 46 | 47 | -------------------------------------------------------------------------------- /pipeline/dags/archived/sandbox/dag_sync_sandbox_labels.py: -------------------------------------------------------------------------------- 1 | from datetime import 
datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.models import Variable 5 | from airflow.contrib.operators.ssh_operator import SSHOperator 6 | from airflow.contrib.hooks.ssh_hook import SSHHook 7 | 8 | from hooks.discord import send_alert_discord 9 | from hooks.pagerduty import send_alert_pagerduty 10 | 11 | default_args = { 12 | 'owner': 'coder2j', 13 | 'retries': 5, 14 | 'retry_delay': timedelta(minutes=2), 15 | 'on_failure_callback': [send_alert_discord, send_alert_pagerduty], 16 | } 17 | 18 | sandbox_db_sync_path = Variable.get("sandbox_db_sync_path") 19 | dev_sandbox_db_sync_path = Variable.get("dev_sandbox_db_sync_path") 20 | 21 | with DAG( 22 | dag_id='sync_sandbox_db_labels', 23 | default_args=default_args, 24 | description='sync labels to the sandbox', 25 | start_date=datetime(2024, 7, 10, 18), 26 | # schedule_interval='*/10 * * * *', 27 | schedule=None, 28 | is_paused_upon_creation=True, 29 | max_active_runs=1, 30 | catchup=False, 31 | ) as dag: 32 | ssh_hook = SSHHook(ssh_conn_id='eigen2', keepalive_interval=60, cmd_timeout=None) 33 | 34 | run_append = SSHOperator( 35 | task_id="run_append_v1", 36 | command=f"cd {sandbox_db_sync_path}; ./1-run-append_v1.sh -l", 37 | ssh_hook=ssh_hook, 38 | dag=dag) 39 | 40 | run_append 41 | 42 | -------------------------------------------------------------------------------- /pipeline/dags/cura/dag_direct_cast_join_requests.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.models import Variable 5 | from airflow.operators.bash import BashOperator 6 | from airflow.providers.ssh.operators.ssh import SSHHook 7 | from airflow.providers.ssh.operators.ssh import SSHOperator 8 | from airflow.decorators import task_group 9 | 10 | from hooks.discord import send_alert_discord 11 | from hooks.pagerduty import send_alert_pagerduty 12 | 13 | default_args = { 14 | "owner": "coder2j", 15 | "retries": 5, 16 | "retry_delay": timedelta(minutes=2), 17 | "on_failure_callback": [send_alert_discord, send_alert_pagerduty], 18 | } 19 | 20 | HOST_REPO_URL='cura-bot-2' 21 | 22 | with DAG( 23 | dag_id="cura_direct_cast_join_requests", 24 | default_args=default_args, 25 | description="Direct cast join requests from curabot", 26 | start_date=datetime(2024, 11, 7), 27 | schedule_interval='0 * * * *', 28 | is_paused_upon_creation=True, 29 | max_active_runs=1, 30 | catchup=False, 31 | ) as dag: 32 | 33 | ssh_hook = SSHHook(ssh_conn_id='eigen1', keepalive_interval=60, cmd_timeout=None) 34 | 35 | eigen1_install_dependencies = SSHOperator( 36 | task_id="cura_eigen1_install_deps", 37 | command=f"cd {HOST_REPO_URL} && git reset --hard HEAD && git pull origin main && pnpm i", 38 | ssh_hook=ssh_hook, 39 | dag=dag, 40 | ) 41 | 42 | eigen1_direct_cast_join_requests = SSHOperator( 43 | task_id="cura_eigen1_direct_cast_join_requests", 44 | command=f"cd {HOST_REPO_URL} && npm run script:direct_cast_join_requests", 45 | ssh_hook=ssh_hook, 46 | dag=dag, 47 | ) 48 | 49 | eigen1_install_dependencies >> eigen1_direct_cast_join_requests 50 | 51 | -------------------------------------------------------------------------------- /pipeline/dags/cura/dag_run_autoinvite_rules.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.models import Variable 5 | from airflow.operators.bash import BashOperator 6 | from 
airflow.providers.ssh.operators.ssh import SSHHook 7 | from airflow.providers.ssh.operators.ssh import SSHOperator 8 | from airflow.decorators import task_group 9 | 10 | from hooks.discord import send_alert_discord 11 | from hooks.pagerduty import send_alert_pagerduty 12 | 13 | default_args = { 14 | "owner": "coder2j", 15 | "retries": 5, 16 | "retry_delay": timedelta(minutes=2), 17 | "on_failure_callback": [send_alert_discord, send_alert_pagerduty], 18 | } 19 | 20 | HOST_REPO_URL='cura-bot-3' 21 | 22 | with DAG( 23 | dag_id="cura_run_autoinvite_rules", 24 | default_args=default_args, 25 | description="Run all the autoinvite rules", 26 | start_date=datetime(2024, 11, 7), 27 | schedule_interval='0 */4 * * *', 28 | is_paused_upon_creation=True, 29 | max_active_runs=1, 30 | catchup=False, 31 | ) as dag: 32 | 33 | ssh_hook = SSHHook(ssh_conn_id='eigen1', keepalive_interval=60, cmd_timeout=None) 34 | 35 | eigen1_install_dependencies = SSHOperator( 36 | task_id="cura_eigen1_install_deps", 37 | command=f"cd {HOST_REPO_URL} && git reset --hard HEAD && git pull origin main && pnpm i", 38 | ssh_hook=ssh_hook, 39 | dag=dag, 40 | ) 41 | 42 | eigen1_run_autoinvite = SSHOperator( 43 | task_id="cura_eigen1_run_autoinvite", 44 | command=f"cd {HOST_REPO_URL} && npm run script:autoinvite", 45 | ssh_hook=ssh_hook, 46 | dag=dag, 47 | ) 48 | 49 | eigen1_install_dependencies >> eigen1_run_autoinvite 50 | -------------------------------------------------------------------------------- /pipeline/dags/cura/dag_run_quote_casts.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.models import Variable 5 | from airflow.operators.bash import BashOperator 6 | from airflow.providers.ssh.operators.ssh import SSHHook 7 | from airflow.providers.ssh.operators.ssh import SSHOperator 8 | from airflow.decorators import task_group 9 | 10 | from hooks.discord import send_alert_discord 11 | from hooks.pagerduty import send_alert_pagerduty 12 | 13 | default_args = { 14 | "owner": "coder2j", 15 | "retries": 5, 16 | "retry_delay": timedelta(minutes=2), 17 | "on_failure_callback": [send_alert_discord, send_alert_pagerduty], 18 | } 19 | 20 | HOST_REPO_URL='cura-bot-1' 21 | 22 | with DAG( 23 | dag_id="cura_run_quote_casts", 24 | default_args=default_args, 25 | description="Quote a cast and post it from curabot", 26 | start_date=datetime(2024, 11, 7), 27 | schedule_interval='0 0 * * 5', 28 | is_paused_upon_creation=True, 29 | max_active_runs=1, 30 | catchup=False, 31 | ) as dag: 32 | 33 | ssh_hook = SSHHook(ssh_conn_id='eigen1', keepalive_interval=60, cmd_timeout=None) 34 | 35 | eigen1_install_dependencies = SSHOperator( 36 | task_id="cura_eigen1_install_deps", 37 | command=f"cd {HOST_REPO_URL} && git reset --hard HEAD && git pull origin main && pnpm i", 38 | ssh_hook=ssh_hook, 39 | dag=dag, 40 | ) 41 | 42 | eigen1_run_quote_casts = SSHOperator( 43 | task_id="cura_eigen1_run_quote_casts", 44 | command=f"cd {HOST_REPO_URL} && npm run script:quote_casts", 45 | ssh_hook=ssh_hook, 46 | dag=dag, 47 | ) 48 | 49 | eigen1_install_dependencies >> eigen1_run_quote_casts 50 | 51 | -------------------------------------------------------------------------------- /pipeline/dags/dag_backup_to_s3_v1.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.operators.bash import BashOperator 5 | from 
airflow.sensors.external_task import ExternalTaskSensor 6 | 7 | from hooks.discord import send_alert_discord 8 | from hooks.pagerduty import send_alert_pagerduty 9 | 10 | default_args = { 11 | 'owner': 'coder2j', 12 | 'retries': 5, 13 | 'retry_delay': timedelta(minutes=2), 14 | 'on_failure_callback': [send_alert_discord, send_alert_pagerduty], 15 | } 16 | 17 | 18 | with DAG( 19 | dag_id='backup_to_s3_v1', 20 | default_args=default_args, 21 | description='This backs up globaltrust, localtrust and channel_ranking into s3', 22 | start_date=datetime(2024, 8, 15), 23 | schedule_interval='30 20 * * *', 24 | catchup=False, 25 | ) as dag: 26 | 27 | task1 = BashOperator( 28 | task_id='backup_globaltrust', 29 | bash_command="cd /pipeline/dags/pg_to_dune && ./upload_to_dune.sh globaltrust" 30 | ) 31 | 32 | task2 = BashOperator( 33 | task_id='backup_globaltrust_config', 34 | bash_command="cd /pipeline/dags/pg_to_dune && ./upload_to_dune.sh globaltrust_config" 35 | ) 36 | 37 | task3 = BashOperator( 38 | task_id='backup_localtrust', 39 | bash_command="cd /pipeline/dags/pg_to_dune && ./upload_to_dune.sh localtrust_v1 /pipeline/tmp/graph_files" 40 | ) 41 | 42 | [task1, task2, task3] 43 | 44 | -------------------------------------------------------------------------------- /pipeline/dags/dag_notify_channel_daily_trending.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta, timezone 2 | import pytz 3 | 4 | from airflow import DAG 5 | from airflow.operators.bash import BashOperator 6 | from airflow.operators.empty import EmptyOperator 7 | from airflow.decorators import task 8 | 9 | from hooks.discord import send_alert_discord 10 | from hooks.pagerduty import send_alert_pagerduty 11 | 12 | default_args = { 13 | 'owner': 'karma3labs', 14 | 'retries': 5, 15 | 'retry_delay': timedelta(minutes=2), 16 | 'on_failure_callback': [send_alert_discord, send_alert_pagerduty], 17 | } 18 | 19 | def _9ampacific_in_utc_time(): 20 | pacific_tz = pytz.timezone('US/Pacific') 21 | pacific_9am_str = ' '.join([datetime.now(pacific_tz).strftime("%Y-%m-%d"),'09:00:00']) 22 | pacific_time = pacific_tz.localize(datetime.strptime(pacific_9am_str, '%Y-%m-%d %H:%M:%S')) 23 | utc_time = pacific_time.astimezone(pytz.utc) 24 | return utc_time 25 | 26 | with DAG( 27 | dag_id='notify_channel_daily_trending', 28 | default_args=default_args, 29 | description='daily notifications for trending channels', 30 | start_date=datetime(2024, 7, 10, 18), 31 | schedule_interval='30 16 * * *', # every day at 16:30/17:30 UTC / 09:30 Pacific 32 | is_paused_upon_creation=True, 33 | max_active_runs=1, 34 | catchup=False, 35 | ) as dag: 36 | 37 | skip_notify = EmptyOperator(task_id="skip_notify") 38 | 39 | notify = BashOperator( 40 | task_id="notify", 41 | bash_command=( 42 | "cd /pipeline && ./run_notify_channel_daily_trending.sh " 43 | " -w . 
-v .venv -c channels/Trending_Channels.csv "), 44 | dag=dag) 45 | 46 | @task.branch(task_id="check_last_successful") 47 | def check_last_successful(**context) -> bool: 48 | now = datetime.now(pytz.utc) 49 | prev_run_date = context['prev_data_interval_end_success'] 50 | daily_run = _9ampacific_in_utc_time() 51 | print(f"now: {now}, prev_run_date: {prev_run_date}, daily_run: {daily_run}") 52 | if ( 53 | now > daily_run 54 | and (prev_run_date is None or prev_run_date < daily_run) 55 | ): 56 | # Last successful run was before today, so we should run 57 | print(f"Last run {prev_run_date} was before {daily_run}, so we should run") 58 | return "notify" 59 | return "skip_notify" 60 | 61 | check_last_successful = check_last_successful() 62 | 63 | check_last_successful >> skip_notify 64 | 65 | check_last_successful >> notify 66 | 67 | -------------------------------------------------------------------------------- /pipeline/dags/dag_notify_channel_leaderboard.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta, timezone 2 | import pytz 3 | 4 | from airflow import DAG 5 | from airflow.operators.bash import BashOperator 6 | from airflow.operators.empty import EmptyOperator 7 | from airflow.decorators import task 8 | 9 | from hooks.discord import send_alert_discord 10 | from hooks.pagerduty import send_alert_pagerduty 11 | 12 | default_args = { 13 | 'owner': 'karma3labs', 14 | 'retries': 5, 15 | 'retry_delay': timedelta(minutes=2), 16 | 'on_failure_callback': [send_alert_discord, send_alert_pagerduty], 17 | } 18 | 19 | def _monday_9ampacific_in_utc_time(): 20 | pacific_tz = pytz.timezone('US/Pacific') 21 | pacific_9am_str = ' '.join([datetime.now(pacific_tz).strftime("%Y-%m-%d"),'09:00:00']) 22 | pacific_time = pacific_tz.localize(datetime.strptime(pacific_9am_str, '%Y-%m-%d %H:%M:%S')) 23 | utc_time = pacific_time.astimezone(pytz.utc) 24 | monday_utc_time = utc_time - timedelta(days=utc_time.weekday() - 0) 25 | return monday_utc_time 26 | 27 | with DAG( 28 | dag_id='notify_channel_leaderboard', 29 | default_args=default_args, 30 | description='channel notifications started by trigger dag or manually', 31 | start_date=datetime(2024, 7, 10, 18), 32 | schedule_interval=None, 33 | is_paused_upon_creation=True, 34 | max_active_runs=1, 35 | catchup=False, 36 | ) as dag: 37 | 38 | skip_notify = EmptyOperator(task_id="skip_notify") 39 | 40 | notify = BashOperator( 41 | task_id="notify", 42 | bash_command="cd /pipeline && ./run_notify_channel_leaderboard.sh -w . 
-v .venv -r ", 43 | dag=dag) 44 | 45 | @task.branch(task_id="check_last_successful") 46 | def check_last_successful(**context) -> bool: 47 | now = datetime.now(pytz.utc) 48 | prev_run_date = context['prev_data_interval_start_success'] 49 | weekly_run = _monday_9ampacific_in_utc_time() 50 | print(f"now: {now}, prev_run_date: {prev_run_date}, weekly_run: {weekly_run}") 51 | if ( 52 | now > weekly_run 53 | and (prev_run_date is None or prev_run_date < weekly_run) 54 | ): 55 | # Last successful run was before 9am on Monday, so we should run 56 | print(f"Last run {prev_run_date} was before {weekly_run}, so we should run") 57 | return "notify" 58 | return "skip_notify" 59 | 60 | check_last_successful = check_last_successful() 61 | 62 | check_last_successful >> skip_notify 63 | 64 | check_last_successful >> notify 65 | 66 | -------------------------------------------------------------------------------- /pipeline/dags/dag_notify_channel_weekly_mods.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | import pytz 3 | 4 | from airflow import DAG 5 | from airflow.operators.bash import BashOperator 6 | from airflow.operators.empty import EmptyOperator 7 | from airflow.decorators import task 8 | 9 | from hooks.discord import send_alert_discord 10 | from hooks.pagerduty import send_alert_pagerduty 11 | 12 | default_args = { 13 | 'owner': 'karma3labs', 14 | 'retries': 5, 15 | 'retry_delay': timedelta(minutes=2), 16 | 'on_failure_callback': [send_alert_discord, send_alert_pagerduty], 17 | } 18 | 19 | def wed_9ampacific_in_utc_time(): 20 | wednesday_dow = 2 21 | pacific_tz = pytz.timezone('US/Pacific') 22 | pacific_9am_str = ' '.join([datetime.now(pacific_tz).strftime("%Y-%m-%d"),'09:00:00']) 23 | pacific_time = pacific_tz.localize(datetime.strptime(pacific_9am_str, '%Y-%m-%d %H:%M:%S')) 24 | utc_time = pacific_time.astimezone(pytz.utc) 25 | return utc_time - timedelta(days=utc_time.weekday() - wednesday_dow) 26 | 27 | with DAG( 28 | dag_id='notify_channel_weekly_mods', 29 | default_args=default_args, 30 | description='weekly notifications to mods', 31 | start_date=datetime(2024, 7, 10, 18), 32 | schedule_interval='30 16 * * 3', # every Wednesday at 16:30/17:30 UTC / 09:30 Pacific 33 | is_paused_upon_creation=True, 34 | max_active_runs=1, 35 | catchup=False, 36 | ) as dag: 37 | 38 | skip_notify = EmptyOperator(task_id="skip_notify") 39 | 40 | notify = BashOperator( 41 | task_id="notify", 42 | bash_command=( 43 | "cd /pipeline && ./run_notify_channel_weekly_mods.sh " 44 | " -w . 
-v .venv -b channels/Bot_Fids.csv -s '{{ prev_data_interval_end_success }}'"), 45 | dag=dag) 46 | 47 | @task.branch(task_id="check_last_successful") 48 | def check_last_successful(**context) -> bool: 49 | now = datetime.now(pytz.utc) 50 | prev_run_date = context['prev_data_interval_end_success'] 51 | weekly_run = wed_9ampacific_in_utc_time() 52 | print(f"now: {now}, prev_run_date: {prev_run_date}, weekly_run: {weekly_run}") 53 | if ( 54 | now > weekly_run 55 | and (prev_run_date is None or prev_run_date < weekly_run) 56 | ): 57 | # Last successful run was before today, so we should run 58 | print(f"Last run {prev_run_date} was before {weekly_run}, so we should run") 59 | return "notify" 60 | return "skip_notify" 61 | 62 | check_last_successful = check_last_successful() 63 | 64 | check_last_successful >> skip_notify 65 | 66 | check_last_successful >> notify 67 | 68 | -------------------------------------------------------------------------------- /pipeline/dags/dag_refresh_rank_view_v0.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.operators.bash import BashOperator 5 | from airflow.sensors.external_task import ExternalTaskSensor 6 | 7 | from hooks.discord import send_alert_discord 8 | from hooks.pagerduty import send_alert_pagerduty 9 | 10 | default_args = { 11 | 'owner': 'coder2j', 12 | 'retries': 5, 13 | 'retry_delay': timedelta(minutes=2), 14 | 'on_failure_callback': [send_alert_discord, send_alert_pagerduty], 15 | } 16 | 17 | with DAG( 18 | dag_id='refresh_rank_view_v0', 19 | default_args=default_args, 20 | description='This refreshes k3l_rank materialized view and vacuums k3l_rank table', 21 | start_date=datetime(2024, 7, 9, 18), 22 | # schedule_interval='0 1-23/6 * * *', 23 | schedule=None, 24 | catchup=False, 25 | ) as dag: 26 | 27 | task1 = BashOperator( 28 | task_id='refresh_view_k3l_rank_e8', 29 | bash_command='''cd /pipeline/ && ./run_eigen8_postgres_sql.sh -w . " 30 | REFRESH MATERIALIZED VIEW CONCURRENTLY k3l_rank; " 31 | ''' 32 | ) 33 | 34 | task2 = BashOperator( 35 | task_id='vacuum_k3l_rank_e8', 36 | bash_command='''cd /pipeline/ && ./run_eigen8_postgres_sql.sh -w . 
" 37 | VACUUM ANALYZE k3l_rank; " 38 | ''' 39 | ) 40 | 41 | task1 >> task2 42 | -------------------------------------------------------------------------------- /pipeline/dags/dag_run_cast_pipeline_v0.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta, timezone 2 | 3 | from airflow import DAG 4 | from airflow.operators.bash import BashOperator 5 | from airflow.decorators import task 6 | 7 | from hooks.discord import send_alert_discord 8 | from hooks.pagerduty import send_alert_pagerduty 9 | 10 | default_args = { 11 | 'owner': 'coder2j', 12 | 'retries': 5, 13 | 'retry_delay': timedelta(minutes=2), 14 | 'on_failure_callback': [send_alert_discord, send_alert_pagerduty], 15 | } 16 | 17 | with DAG( 18 | dag_id='run_cast_pipeline_v0', 19 | default_args=default_args, 20 | description='extract cast interactions and refresh pg statistics', 21 | start_date=datetime(2024, 7, 9, 18), 22 | # schedule_interval='*/10 * * * *', 23 | schedule_interval=timedelta(minutes=5), 24 | max_active_runs=1, 25 | is_paused_upon_creation=True, 26 | catchup=False, 27 | ) as dag: 28 | 29 | insert = BashOperator( 30 | task_id='insert_cast_actions', 31 | bash_command='cd /pipeline/ && ./run_cast_pipeline.sh -v ./.venv/ ' 32 | ) 33 | 34 | insert8 = BashOperator( 35 | task_id='insert_cast_actions_e8', 36 | bash_command='cd /pipeline/ && ./run_cast_pipeline.sh -v ./.venv/ -p eigen8 ' 37 | ) 38 | 39 | refresh = BashOperator( 40 | task_id='refresh_parent_casts_view', 41 | bash_command='''cd /pipeline/ && ./run_eigen2_postgres_sql.sh -w . " 42 | REFRESH MATERIALIZED VIEW CONCURRENTLY k3l_recent_parent_casts;" 43 | ''' 44 | ) 45 | 46 | refresh8 = BashOperator( 47 | task_id='refresh_parent_casts_view_e8', 48 | bash_command='''cd /pipeline/ && ./run_eigen8_postgres_sql.sh -w . 
" 49 | REFRESH MATERIALIZED VIEW CONCURRENTLY k3l_recent_parent_casts;" 50 | ''' 51 | ) 52 | 53 | @task.bash 54 | def gapfill_task(db: str) -> str: 55 | yesterday = datetime.now(timezone.utc) - timedelta(hours=25) 56 | return f"cd /pipeline/ && ./run_cast_pipeline.sh -v ./.venv/"\ 57 | f" -f gapfill -p {db} -t '{yesterday.strftime('%Y-%m-%d %H:%M:%S')}'" 58 | 59 | gapfill = gapfill_task.override(task_id='gapfill_cast_actions')('eigen2') 60 | gapfill8 = gapfill_task.override(task_id='gapfill_cast_actions_e8')('eigen8') 61 | 62 | insert >> refresh >> gapfill 63 | insert8 >> refresh8 >> gapfill8 64 | -------------------------------------------------------------------------------- /pipeline/dags/dag_update_channel_points.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.operators.bash import BashOperator 5 | 6 | from hooks.discord import send_alert_discord 7 | from hooks.pagerduty import send_alert_pagerduty 8 | 9 | default_args = { 10 | 'owner': 'karma3labs', 11 | 'retries': 5, 12 | 'retry_delay': timedelta(minutes=2), 13 | 'on_failure_callback': [send_alert_discord, send_alert_pagerduty], 14 | } 15 | 16 | with DAG( 17 | dag_id='update_channel_points_v2', 18 | default_args=default_args, 19 | description='update channel points triggered by update_channel_tokens dag', 20 | start_date=datetime(2024, 7, 10, 18), 21 | schedule_interval='0 16 * * *', # every day at 17:00 UTC / 09:00 Pacific 22 | # schedule_interval=timedelta(days=1), 23 | # schedule=None, 24 | is_paused_upon_creation=True, 25 | max_active_runs=1, 26 | catchup=False, 27 | ) as dag: 28 | 29 | # run_genesis = BashOperator( 30 | # task_id="run_genesis", 31 | # bash_command="cd /pipeline && ./run_update_channel_points.sh -w . -v .venv -t genesis", 32 | # dag=dag) 33 | 34 | # daily_calc = BashOperator( 35 | # task_id="daily_calc", 36 | # bash_command="cd /pipeline && ./run_update_channel_points.sh -w . -v .venv -t compute", 37 | # dag=dag) 38 | 39 | # balance_update = BashOperator( 40 | # task_id="balance_update", 41 | # bash_command="cd /pipeline && ./run_update_channel_points.sh -w . -v .venv -t update", 42 | # dag=dag) 43 | 44 | # run_genesis8 = BashOperator( 45 | # task_id="run_genesis8", 46 | # bash_command="cd /pipeline && ./run_update_channel_points.sh -w . -v .venv -t genesis -p eigen8", 47 | # dag=dag) 48 | 49 | daily_calc8 = BashOperator( 50 | task_id="daily_calc8", 51 | bash_command="cd /pipeline && ./run_update_channel_points.sh -w . -v .venv -t compute -p eigen8", 52 | dag=dag) 53 | 54 | balance_update8 = BashOperator( 55 | task_id="balance_update8", 56 | bash_command="cd /pipeline && ./run_update_channel_points.sh -w . 
-v .venv -t update -p eigen8", 57 | dag=dag) 58 | 59 | backup_to_s3 = BashOperator( 60 | task_id='backup_channel_points_bal', 61 | bash_command="cd /pipeline/dags/pg_to_dune && ./upload_to_dune.sh backup_channel_points_bal " 62 | ) 63 | 64 | # run_genesis >> daily_calc >> balance_update >> backup_to_s3 65 | # run_genesis8 >> daily_calc8 >> balance_update8 66 | daily_calc8 >> balance_update8 >> backup_to_s3 67 | -------------------------------------------------------------------------------- /pipeline/dags/extractors/dag_cura_mod.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.operators.bash import BashOperator 5 | 6 | from hooks.discord import send_alert_discord 7 | from hooks.pagerduty import send_alert_pagerduty 8 | 9 | default_args = { 10 | "owner": "karma3labs", 11 | "retries": 1, 12 | "retry_delay": timedelta(minutes=5), 13 | 'on_failure_callback': [send_alert_discord, send_alert_pagerduty], 14 | } 15 | 16 | with DAG( 17 | "extract_cura_mod", 18 | default_args=default_args, 19 | description="Fetch hidden fids from CURA API and load into DB daily", 20 | schedule_interval=timedelta(minutes=5), 21 | # schedule_interval=None, 22 | start_date=datetime(2024, 8, 1), 23 | is_paused_upon_creation=True, 24 | max_active_runs=1, 25 | catchup=False, 26 | ) as dag: 27 | 28 | fetch_task = BashOperator( 29 | task_id='extract_cura_hidden_fids', 30 | bash_command="cd /pipeline; extractors/extract_cura_mod.sh -w . -v .venv -r ", 31 | dag=dag 32 | ) 33 | 34 | fetch_task -------------------------------------------------------------------------------- /pipeline/dags/monitoring/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/pipeline/dags/monitoring/__init__.py -------------------------------------------------------------------------------- /pipeline/dags/one_off/.placeholder: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/pipeline/dags/one_off/.placeholder -------------------------------------------------------------------------------- /pipeline/dags/one_off/dag_gen_globaltrust_by_date_v0.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.operators.bash import BashOperator 5 | from hooks.discord import send_alert_discord 6 | 7 | 8 | default_args = { 9 | 'owner': 'coder2j', 10 | 'retries': 5, 11 | 'retry_delay': timedelta(minutes=2), 12 | # 'on_failure_callback': send_alert_discord, 13 | } 14 | 15 | # 2024-06-04 00:00 16 | # 875822 17 | # 2024-06-05 00:00 18 | # 875822 19 | # 2024-06-11 00:00 20 | # 921037 21 | # 2024-06-12 00:00 22 | # 921037 23 | # 2024-06-15 00:00 24 | # 960387 25 | # 2024-06-16 00:00 26 | # 960387 27 | with DAG( 28 | dag_id='one_off_gen_globaltrust_by_date_v0', 29 | default_args=default_args, 30 | description='This runs run_globaltrust_pipeline.sh without any optimization', 31 | schedule_interval=None, 32 | start_date=None, 33 | is_paused_upon_creation=True, 34 | max_active_runs=1, 35 | catchup=False, 36 | ) as dag: 37 | push_to_dune = BashOperator( 38 | task_id='push_to_dune', 39 | bash_command="cd /pipeline/dags/pg_to_dune && ./upload_to_dune.sh 
overwrite_globaltrust_in_dune_v3 " 40 | ) 41 | 42 | task1 = BashOperator( 43 | task_id='06-05', 44 | bash_command="cd /pipeline && ./run_globaltrust_pipeline.sh -w . -v ./.venv -d 2024-06-05" 45 | ) 46 | 47 | task2 = BashOperator( 48 | task_id='06-12', 49 | bash_command="cd /pipeline && ./run_globaltrust_pipeline.sh -w . -v ./.venv -d 2024-06-12" 50 | ) 51 | 52 | task3 = BashOperator( 53 | task_id='06-16', 54 | bash_command="cd /pipeline && ./run_globaltrust_pipeline.sh -w . -v ./.venv -d 2024-06-16" 55 | ) 56 | 57 | task5 = BashOperator( 58 | task_id='06-04', 59 | bash_command="cd /pipeline && ./run_globaltrust_pipeline.sh -w . -v ./.venv -d 2024-06-04" 60 | ) 61 | 62 | task6 = BashOperator( 63 | task_id='06-11', 64 | bash_command="cd /pipeline && ./run_globaltrust_pipeline.sh -w . -v ./.venv -d 2024-06-11" 65 | ) 66 | 67 | task7 = BashOperator( 68 | task_id='06-15', 69 | bash_command="cd /pipeline && ./run_globaltrust_pipeline.sh -w . -v ./.venv -d 2024-06-15 " 70 | ) 71 | 72 | task1 >> task2 >> task3 >> push_to_dune >> task5 >> task6 >> task7 73 | 74 | -------------------------------------------------------------------------------- /pipeline/dags/one_off/dag_gen_globaltrust_by_date_v1.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.utils.trigger_rule import TriggerRule 5 | from airflow.operators.trigger_dagrun import TriggerDagRunOperator 6 | from airflow.operators.bash import BashOperator 7 | 8 | from hooks.discord import send_alert_discord 9 | from hooks.pagerduty import send_alert_pagerduty 10 | 11 | 12 | default_args = { 13 | 'owner': 'karma3labs', 14 | 'retries': 5, 15 | 'retry_delay': timedelta(minutes=2), 16 | 'on_failure_callback': [send_alert_discord, send_alert_pagerduty], 17 | } 18 | 19 | with DAG( 20 | dag_id='one_off_gen_globaltrust_by_date_v1', 21 | default_args=default_args, 22 | description='This runs run_globaltrust_pipeline.sh without any optimization', 23 | start_date=datetime(2024, 8, 16), 24 | schedule_interval=None, 25 | is_paused_upon_creation=True, 26 | max_active_runs=1, 27 | catchup=False, 28 | ) as dag: 29 | 30 | mkdir_tmp = BashOperator( 31 | task_id="mkdir_tmp", 32 | bash_command= "cd /pipeline; mkdir -p tmp/{{ run_id }}; mkdir -p tmp/graph_files", 33 | dag=dag) 34 | 35 | prep_globaltrust = BashOperator( 36 | task_id="prep_globaltrust", 37 | bash_command= "cd /pipeline; ./run_globaltrust_pipeline.sh -s prep" 38 | " -w . -v ./.venv -t tmp/{{ run_id }} -o tmp/graph_files/ -d 2024-10-26", 39 | dag=dag) 40 | 41 | compute_engagement = BashOperator( 42 | task_id="compute_engagement", 43 | bash_command= "cd /pipeline; ./run_globaltrust_pipeline.sh -s compute_engagement" 44 | " -w . -v ./.venv -t tmp/{{ run_id }} -o tmp/graph_files/ -d 2024-10-26", 45 | dag=dag) 46 | 47 | 48 | insert_db = BashOperator( 49 | task_id="insert_db", 50 | bash_command= "cd /pipeline; ./run_globaltrust_pipeline.sh -s insert_db" 51 | " -w . 
-v ./.venv -t tmp/{{ run_id }} -o tmp/graph_files/ -d 2024-10-26", 52 | dag=dag) 53 | 54 | upload_to_dune = BashOperator( 55 | task_id="upload_to_dune", 56 | bash_command= "cd /pipeline/dags/pg_to_dune; ./upload_to_dune.sh overwrite_globaltrust_in_dune_v3", 57 | dag=dag) 58 | 59 | trigger_refresh_views = TriggerDagRunOperator( 60 | task_id="trigger_refresh_views", 61 | trigger_dag_id="refresh_rank_view_v0", 62 | conf={"trigger": "gen_globaltrust_v1"}, 63 | ) 64 | 65 | # trigger_sync_sandbox = TriggerDagRunOperator( 66 | # task_id="trigger_sync_sandbox", 67 | # trigger_dag_id="sync_sandbox_globaltrust", 68 | # conf={"trigger": "gen_globaltrust_v1"}, 69 | # ) 70 | 71 | ( 72 | mkdir_tmp 73 | >> prep_globaltrust 74 | >> compute_engagement 75 | >> insert_db 76 | >> upload_to_dune 77 | >> trigger_refresh_views 78 | # >> trigger_sync_sandbox 79 | ) 80 | -------------------------------------------------------------------------------- /pipeline/dags/one_off/dag_insert_to_dune_table.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.operators.bash import BashOperator 5 | from hooks.discord import send_alert_discord 6 | from hooks.pagerduty import send_alert_pagerduty 7 | 8 | 9 | default_args = { 10 | 'owner': 'coder2j', 11 | 'retries': 5, 12 | 'retry_delay': timedelta(minutes=2), 13 | 'on_failure_callback': [send_alert_discord, send_alert_pagerduty], 14 | } 15 | 16 | 17 | with DAG( 18 | dag_id='one_off_insert_to_dune_tables', 19 | default_args=default_args, 20 | description='This inserts globaltrust and channel_ranking into dune', 21 | schedule_interval=None, 22 | start_date=None, 23 | is_paused_upon_creation=True, 24 | max_active_runs=1, 25 | catchup=False, 26 | ) as dag: 27 | task4 = BashOperator( 28 | task_id='overwrite_globaltrust_in_dune_v3', 29 | bash_command="cd /pipeline/dags/pg_to_dune && ./upload_to_dune.sh overwrite_globaltrust_in_dune_v3" 30 | ) 31 | 32 | task5 = BashOperator( 33 | task_id='overwrite_channel_rank_in_dune_v3', 34 | bash_command="cd /pipeline/dags/pg_to_dune && ./upload_to_dune.sh overwrite_channel_rank_in_dune_v3" 35 | ) 36 | 37 | [task4, task5] 38 | 39 | -------------------------------------------------------------------------------- /pipeline/dags/one_off/dag_migrate_dune_table.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.operators.bash import BashOperator 5 | 6 | 7 | default_args = { 8 | 'owner': 'coder2j', 9 | 'retries': 5, 10 | 'retry_delay': timedelta(minutes=2) 11 | } 12 | 13 | 14 | with DAG( 15 | dag_id='one_off_migrate_dune_table', 16 | default_args=default_args, 17 | description='This backs up globaltrust, localtrust and channel_ranking into s3', 18 | schedule_interval=None, 19 | start_date=None, 20 | is_paused_upon_creation=True, 21 | max_active_runs=1, 22 | catchup=False, 23 | ) as dag: 24 | task1 = BashOperator( 25 | task_id='create_dune_globaltrust_table', 26 | bash_command="cd /pipeline/dags/pg_to_dune && ./upload_to_dune.sh create_dune_globaltrust_table dataset_k3l_cast_globaltrust_v2" 27 | ) 28 | 29 | [task1] 30 | 31 | -------------------------------------------------------------------------------- /pipeline/dags/one_off/dag_trial_branch.py: -------------------------------------------------------------------------------- 1 | from datetime import timedelta 2 | 3 | import pytz 4 | import datetime 5 | 
from airflow import DAG 6 | from airflow.utils.trigger_rule import TriggerRule 7 | from airflow.operators.empty import EmptyOperator 8 | from airflow.operators.python import PythonOperator 9 | 10 | from airflow.decorators import task, task_group 11 | 12 | default_args = { 13 | 'owner': 'karma3labs', 14 | 'retries': 5, 15 | 'retry_delay': timedelta(minutes=2), 16 | } 17 | 18 | def _monday_9ampacific_in_utc_time(): 19 | pacific_tz = pytz.timezone('US/Pacific') 20 | pacific_9am_str = ' '.join([datetime.datetime.now(pacific_tz).strftime("%Y-%m-%d"),'09:00:00']) 21 | pacific_time = pacific_tz.localize(datetime.datetime.strptime(pacific_9am_str, '%Y-%m-%d %H:%M:%S')) 22 | utc_time = pacific_time.astimezone(pytz.utc) 23 | monday = utc_time - timedelta(days=utc_time.weekday()) 24 | return monday 25 | 26 | with DAG( 27 | dag_id='one_off_trial_branch', 28 | default_args=default_args, 29 | description="One off dag to test new features", 30 | schedule_interval=None, 31 | start_date=None, 32 | is_paused_upon_creation=True, 33 | max_active_runs=1, 34 | catchup=False, 35 | ) as dag: 36 | 37 | @task.branch(task_id="branch") 38 | def branch_fn(**context): 39 | print(f"context: {context}") 40 | prev = context['prev_execution_date_success'] 41 | print(f"prev_execution_date_success: {prev}") 42 | if prev > _monday_9ampacific_in_utc_time(): 43 | return "t2" 44 | return "t1" 45 | 46 | def empty_fn(*args, **kwargs): 47 | pass 48 | 49 | branch = branch_fn() 50 | t1 = EmptyOperator(task_id="t1") 51 | t2 = EmptyOperator(task_id="t2") 52 | 53 | 54 | @task_group(group_id='all_group') 55 | def tg_all(): 56 | always = PythonOperator(task_id="always", 57 | python_callable=empty_fn, 58 | op_args=[], 59 | op_kwargs={}, 60 | trigger_rule=TriggerRule.ALL_SUCCESS) 61 | t3 = EmptyOperator(task_id="t3") 62 | 63 | always >> t3 64 | 65 | @task_group(group_id='some_group') 66 | def tg_some(): 67 | always = PythonOperator(task_id="always", 68 | python_callable=empty_fn, 69 | op_args=[], 70 | op_kwargs={}, 71 | trigger_rule=TriggerRule.ALL_SUCCESS) 72 | sometimes = EmptyOperator(task_id="sometimes") 73 | t3 = EmptyOperator(task_id="t3") 74 | 75 | always >> sometimes >> t3 76 | 77 | branch >> t1 >> tg_all() 78 | branch >> t2 >> tg_some() 79 | 80 | -------------------------------------------------------------------------------- /pipeline/dags/one_off/dag_trial_sql.py: -------------------------------------------------------------------------------- 1 | from datetime import timedelta 2 | 3 | from airflow import DAG 4 | from airflow.operators.empty import EmptyOperator 5 | from airflow.providers.common.sql.operators.sql import SQLCheckOperator 6 | 7 | default_args = { 8 | "owner": "karma3labs", 9 | "retries": 0, 10 | "retry_delay": timedelta(minutes=5), 11 | } 12 | 13 | _CONN_ID = "eig2_readonly_user" 14 | CHECK_QUERY = """ 15 | WITH 16 | channel_rank_stats AS ( 17 | SELECT 18 | COUNT(*) AS tot_rows, 19 | COUNT(DISTINCT channel_id) AS tot_channels, 20 | strategy_name 21 | FROM k3l_channel_rank 22 | GROUP BY strategy_name 23 | ), 24 | channel_fids_stats as ( 25 | SELECT 26 | COUNT(*) AS tot_rows, 27 | COUNT(DISTINCT channel_id) AS tot_channels, 28 | strategy_name 29 | -- TODO change table name to k3l_channel_fids 30 | FROM k3l_channel_rank 31 | GROUP BY strategy_name 32 | ) 33 | SELECT 34 | BOOL_AND( 35 | t2.tot_rows >= t1.tot_rows 36 | AND t2.tot_channels >= t1.tot_channels 37 | AND t2.strategy_name IS NOT NULL 38 | ) 39 | FROM channel_rank_stats as t1 40 | LEFT JOIN channel_fids_stats as t2 ON (t2.strategy_name = t1.strategy_name) 41 | 
""" 42 | 43 | with DAG( 44 | "one_off_trial_sql", 45 | default_args=default_args, 46 | description="One off dag to test new features", 47 | schedule_interval=None, 48 | start_date=None, 49 | is_paused_upon_creation=True, 50 | max_active_runs=1, 51 | catchup=False, 52 | ) as dag: 53 | 54 | start = EmptyOperator(task_id="start") 55 | 56 | sql_check = SQLCheckOperator( 57 | task_id='sql_check', 58 | sql=CHECK_QUERY, 59 | conn_id=_CONN_ID 60 | ) 61 | 62 | end = EmptyOperator(task_id="end") 63 | 64 | start >> sql_check >> end 65 | -------------------------------------------------------------------------------- /pipeline/dags/one_off/dag_trial_task_groups.py: -------------------------------------------------------------------------------- 1 | from datetime import timedelta 2 | 3 | from airflow import DAG 4 | from airflow.operators.empty import EmptyOperator 5 | from airflow.operators.bash import BashOperator 6 | from airflow.decorators import task_group 7 | 8 | default_args = { 9 | "owner": "karma3labs", 10 | "retries": 1, 11 | "retry_delay": timedelta(minutes=5), 12 | } 13 | 14 | with DAG( 15 | "one_off_trial_task_groups", 16 | default_args=default_args, 17 | description="One off dag to test new features", 18 | schedule_interval=None, 19 | start_date=None, 20 | is_paused_upon_creation=True, 21 | max_active_runs=1, 22 | catchup=False, 23 | ) as dag: 24 | 25 | @task_group(group_id='my_start_group') 26 | def tg_start(): 27 | start = EmptyOperator(task_id="start") 28 | 29 | echo1 = BashOperator( 30 | task_id="echo1", 31 | bash_command= "echo {{ (logical_date - macros.timedelta(days=90)) | ds }}", 32 | dag=dag 33 | ) 34 | 35 | echo2 = BashOperator( 36 | task_id="echo2", 37 | bash_command= "echo '{{ prev_data_interval_end_success }}'", 38 | dag=dag 39 | ) 40 | 41 | start >> echo1 >> echo2 42 | 43 | @task_group(group_id='my_echo_group') 44 | def tg_echo(): 45 | 46 | echo3 = BashOperator( 47 | task_id="echo3", 48 | bash_command= "echo {{ macros.ds_add(ds, -90) }}", 49 | dag=dag 50 | ) 51 | 52 | echo4 = BashOperator( 53 | task_id="echo4", 54 | bash_command= "echo {{ ds }}", 55 | dag=dag 56 | ) 57 | 58 | echo5 = BashOperator( 59 | task_id="echo5", 60 | bash_command= "echo {{ logical_date }}", 61 | dag=dag 62 | ) 63 | echo3 >> echo4 64 | echo5 65 | 66 | end = EmptyOperator(task_id="end") 67 | 68 | tg_start() >> tg_echo() >> end 69 | 70 | -------------------------------------------------------------------------------- /pipeline/dags/pg_to_dune/.env.sample: -------------------------------------------------------------------------------- 1 | DB_HOST=localhost 2 | DB_PORT=5432 3 | DB_NAME=farcaster 4 | DB_SSLMODE=allow 5 | DB_USERNAME=k3l_user 6 | DB_PASSWORD=changeme 7 | AWS_ACCESS_KEY_ID="changeme" 8 | AWS_SECRET_ACCESS_KEY="changeme" 9 | AWS_REGION="eu-central-1" 10 | GCP_TASK_ACCT="changeme" 11 | GCS_BUCKET_NAME="changeme" 12 | S3_BUCKET_NAME_CONSTANT="changeme" 13 | DUNE_API_KEY="changeme" -------------------------------------------------------------------------------- /pipeline/dags/pg_to_dune/app/check_last_timestamp.py: -------------------------------------------------------------------------------- 1 | 2 | import os, json 3 | from dune_client.types import QueryParameter 4 | from dune_client.client import DuneClient 5 | from dune_client.query import QueryBase 6 | 7 | # change the current working directory where .env file lives 8 | # os.chdir("/Users/abc/local-Workspace/python-notebook-examples") 9 | # load .env file 10 | # dotenv.load_dotenv(".env") 11 | # setup Dune Python client 12 | dune = 
DuneClient(os.environ["DUNE_API_KEY"]) 13 | 14 | query = QueryBase( 15 | name="fetch last date of globaltrust_v2", 16 | query_id=int(os.environ["QUERY_ID"]), 17 | ) 18 | 19 | result = dune.run_query( 20 | query = query, 21 | # performance = 'large' # optionally define which tier to run the execution on (default is "medium") 22 | ) 23 | 24 | if len(result.result.rows) != 1: 25 | raise ValueError(f"Expected exactly 1 row from Dune query, got {len(result.result.rows)}") 26 | 27 | last_date = result.result.rows[0][os.environ["FILTER_COLUMN"]] 28 | print(last_date) 29 | # # go over the results returned 30 | # for row in result.result.rows: 31 | # print('hell') 32 | # print (row) # as an example we print the rows 33 | -------------------------------------------------------------------------------- /pipeline/dags/reports/dag_gen_channel_metrics.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | # from airflow.operators.empty import EmptyOperator 5 | from airflow.operators.bash import BashOperator 6 | 7 | 8 | from hooks.discord import send_alert_discord 9 | from hooks.pagerduty import send_alert_pagerduty 10 | 11 | default_args = { 12 | 'owner': 'karma3labs', 13 | 'retries': 5, 14 | 'retry_delay': timedelta(minutes=2), 15 | 'on_failure_callback': [send_alert_discord, send_alert_pagerduty], 16 | } 17 | 18 | 19 | with DAG( 20 | dag_id='report_gen_metrics', 21 | default_args=default_args, 22 | description='this generates channel metrics', 23 | start_date=datetime(2024, 8, 15), 24 | schedule_interval='0 */6 * * *', 25 | is_paused_upon_creation=True, 26 | max_active_runs=1, 27 | catchup=False, 28 | ) as dag: 29 | 30 | # gen_channel_metrics = EmptyOperator(task_id="gen_channel_metrics") 31 | 32 | gen_channel_metrics = BashOperator( 33 | task_id='gen_channel_metrics', 34 | bash_command='cd /pipeline/ && ./run_channel_metrics.sh -w .
-v ./.venv/ -r ' 35 | ) -------------------------------------------------------------------------------- /pipeline/dags/reports/dag_gen_labels.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.operators.bash import BashOperator 5 | from airflow.operators.trigger_dagrun import TriggerDagRunOperator 6 | 7 | 8 | from hooks.discord import send_alert_discord 9 | from hooks.pagerduty import send_alert_pagerduty 10 | 11 | default_args = { 12 | 'owner': 'karma3labs', 13 | 'retries': 5, 14 | 'retry_delay': timedelta(minutes=2), 15 | # 'on_failure_callback': [send_alert_discord, send_alert_pagerduty], 16 | } 17 | 18 | 19 | with DAG( 20 | dag_id='report_gen_labels', 21 | default_args=default_args, 22 | description='This fetches spammers and save the list into s3', 23 | start_date=datetime(2024, 8, 15), 24 | schedule_interval='0 0 * * *', 25 | is_paused_upon_creation=True, 26 | max_active_runs=1, 27 | catchup=False, 28 | ) as dag: 29 | 30 | gen_top_spammers = BashOperator( 31 | task_id='gen_top_spammers', 32 | bash_command="cd /pipeline && ./run_fetch_top_spammers.sh -v ./.venv" 33 | ) 34 | 35 | gen_top_casters = BashOperator( 36 | task_id='gen_top_casters', 37 | bash_command="cd /pipeline && ./run_fetch_top_caster.sh -v ./.venv" 38 | ) 39 | 40 | trigger_sync_sandbox = TriggerDagRunOperator( 41 | task_id="trigger_sync_sandbox", 42 | trigger_dag_id="sync_sandbox_db_labels", 43 | conf={"trigger": "report_gen_labels"}, 44 | ) 45 | 46 | gen_top_spammers >> gen_top_casters >> trigger_sync_sandbox 47 | 48 | -------------------------------------------------------------------------------- /pipeline/dags/triggers/trigger_gen_channel_ranking_v3.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta, timezone 2 | from airflow.operators.empty import EmptyOperator 3 | from airflow.operators.trigger_dagrun import TriggerDagRunOperator 4 | from airflow.decorators import task, dag 5 | from airflow.models import DagRun 6 | from airflow.utils.state import DagRunState 7 | 8 | default_args = { 9 | 'owner': 'karma3labs', 10 | 'retries': 5, 11 | 'retry_delay': timedelta(minutes=2), 12 | } 13 | 14 | N_CHUNKS = 100 # Define the number of chunks 15 | FREQUENCY_H = 12 # Define the frequency in hours 16 | 17 | @dag( 18 | dag_id='trigger_gen_channel_ranking_v3', 19 | default_args=default_args, 20 | start_date=datetime(2024, 10, 1), 21 | schedule_interval=timedelta(hours=6), 22 | is_paused_upon_creation=True, 23 | max_active_runs=1, 24 | catchup=False # To avoid backfilling if not required 25 | ) 26 | def create_trigger_dag(): 27 | skip_main_dag = EmptyOperator(task_id="skip_main_dag") 28 | 29 | trigger_main_dag = TriggerDagRunOperator( 30 | task_id='trigger_main_dag', 31 | trigger_dag_id='gen_channel_ranking_v3', 32 | execution_date='{{ macros.datetime.now() }}', 33 | conf={"trigger": "trigger_gen_channel_ranking_v3"}, 34 | ) 35 | 36 | @task.branch(task_id="check_last_successful_run") 37 | def check_last_successful_run(**context) -> bool: 38 | dag_runs = DagRun.find(dag_id="gen_channel_ranking_v3", state=DagRunState.SUCCESS) 39 | if not dag_runs or len(dag_runs) == 0: 40 | # No previous runs 41 | print("No previous runs") 42 | return "trigger_main_dag" 43 | print(f"Found {len(dag_runs)} previous runs") 44 | dag_runs.sort(key=lambda x: x.execution_date, reverse=True) 45 | print("Last run: ", dag_runs[0]) 46 | # Query the last 
successful DAG run 47 | last_run = dag_runs[0] 48 | print("Last run: ", last_run) 49 | current_time = datetime.now(timezone.utc) 50 | delta = FREQUENCY_H 51 | if last_run: 52 | print("Last run end_date: ", last_run.end_date) 53 | print("Last run start_date: ", last_run.start_date) 54 | if last_run.end_date: 55 | delta_last = (current_time - last_run.end_date).total_seconds() / 3600 56 | delta = min(delta_last, delta) 57 | if last_run.start_date: 58 | delta_last = (current_time - last_run.start_date).total_seconds() / 3600 59 | delta = min(delta_last, delta) 60 | print(f"Delta: {delta}") 61 | if delta >= FREQUENCY_H: 62 | # Last run was more than FREQUENCY_H hours ago, so we should run 63 | print(f"Last run was more than {FREQUENCY_H} hours ago, so we should run") 64 | return "trigger_main_dag" 65 | return "skip_main_dag" 66 | 67 | check_last_successful_run = check_last_successful_run() 68 | 69 | check_last_successful_run >> trigger_main_dag 70 | 71 | check_last_successful_run >> skip_main_dag 72 | 73 | trigger_dag = create_trigger_dag() 74 | 75 | -------------------------------------------------------------------------------- /pipeline/dags/triggers/trigger_gen_channel_ranking_v4.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta, timezone 2 | from airflow.operators.empty import EmptyOperator 3 | from airflow.operators.trigger_dagrun import TriggerDagRunOperator 4 | from airflow.decorators import task, dag 5 | from airflow.models import DagRun 6 | from airflow.utils.state import DagRunState 7 | 8 | default_args = { 9 | 'owner': 'karma3labs', 10 | 'retries': 5, 11 | 'retry_delay': timedelta(minutes=2), 12 | } 13 | 14 | N_CHUNKS = 100 # Define the number of chunks 15 | FREQUENCY_H = 24 # Define the frequency in hours 16 | 17 | @dag( 18 | dag_id='trigger_gen_channel_ranking_v4', 19 | default_args=default_args, 20 | start_date=datetime(2024, 10, 1), 21 | schedule_interval=timedelta(hours=24), 22 | is_paused_upon_creation=True, 23 | max_active_runs=1, 24 | catchup=False # To avoid backfilling if not required 25 | ) 26 | def create_trigger_dag(): 27 | skip_main_dag = EmptyOperator(task_id="skip_main_dag") 28 | 29 | trigger_main_dag = TriggerDagRunOperator( 30 | task_id='trigger_main_dag', 31 | trigger_dag_id='gen_channel_ranking_v4', 32 | execution_date='{{ macros.datetime.now() }}', 33 | conf={"trigger": "trigger_gen_channel_ranking_v4"}, 34 | ) 35 | 36 | @task.branch(task_id="check_last_successful_run") 37 | def check_last_successful_run(**context) -> bool: 38 | dag_runs = DagRun.find(dag_id="gen_channel_ranking_v4", state=DagRunState.SUCCESS) 39 | if not dag_runs or len(dag_runs) == 0: 40 | # No previous runs 41 | print("No previous runs") 42 | return "trigger_main_dag" 43 | print(f"Found {len(dag_runs)} previous runs") 44 | dag_runs.sort(key=lambda x: x.execution_date, reverse=True) 45 | print("Last run: ", dag_runs[0]) 46 | # Query the last successful DAG run 47 | last_run = dag_runs[0] 48 | print("Last run: ", last_run) 49 | current_time = datetime.now(timezone.utc) 50 | delta = FREQUENCY_H 51 | if last_run: 52 | print("Last run end_date: ", last_run.end_date) 53 | print("Last run start_date: ", last_run.start_date) 54 | if last_run.end_date: 55 | delta_last = (current_time - last_run.end_date).total_seconds() / 3600 56 | delta = min(delta_last, delta) 57 | if last_run.start_date: 58 | delta_last = (current_time - last_run.start_date).total_seconds() / 3600 59 | delta = min(delta_last, delta) 60 | print(f"Delta: 
{delta}") 61 | if delta >= FREQUENCY_H: 62 | # Last run was more than FREQUENCY_H hours ago, so we should run 63 | print(f"Last run was more than {FREQUENCY_H} hours ago, so we should run") 64 | return "trigger_main_dag" 65 | return "skip_main_dag" 66 | 67 | check_last_successful_run = check_last_successful_run() 68 | 69 | check_last_successful_run >> trigger_main_dag 70 | 71 | check_last_successful_run >> skip_main_dag 72 | 73 | trigger_dag = create_trigger_dag() 74 | 75 | -------------------------------------------------------------------------------- /pipeline/extractors/automod_extractor.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from datetime import date 3 | import requests 4 | from sqlalchemy import create_engine 5 | from sqlalchemy import text 6 | import io 7 | from loguru import logger 8 | import sys 9 | 10 | 11 | def fetch_data_from_api(api_key, db_user, db_password, db_endpoint): 12 | params = {'start': '2024-01-01', 'end': '2024-12-31'} 13 | headers = {'api-key': f"{api_key}"} 14 | df_automod = pd.DataFrame() 15 | for channel in ["degen", "dev", "memes"]: 16 | initial_url = f"https://automod.sh/api/partners/channels/{channel}/activity/export?" 17 | response = requests.get(initial_url, params=params, headers=headers) 18 | print(response.url) 19 | if response.status_code == 200: 20 | # Read the response content into a pandas DataFrame 21 | data = pd.read_csv(io.StringIO(response.content.decode('utf-8'))) 22 | data["channel_id"] = channel 23 | print(len(data)) 24 | df_automod = pd.concat([df_automod, data], axis=0) 25 | else: 26 | raise Exception(f"Failed to fetch data from automod. Status code: {response.status_code}") 27 | 28 | if len(df_automod) == 0: 29 | raise Exception("Failed to fetch data from automod. 
No data found.") 30 | 31 | rename_dict = { 32 | 'createdAt': 'created_at', 33 | 'affectedUsername': 'affected_username', 34 | 'affectedUserFid': 'affected_userid', 35 | 'castHash': 'cast_hash', 36 | 'castText': 'cast_text' 37 | } 38 | 39 | df_automod.rename(columns=rename_dict, inplace=True) 40 | df_automod = df_automod[ 41 | ["created_at", "action", "actor", "affected_username", "affected_userid", "cast_hash", "channel_id"]] 42 | df_automod['created_at'] = pd.to_datetime(df_automod['created_at'], unit='ms') 43 | df_automod["date_iso"] = date.today() 44 | 45 | logger.info(df_automod.head()) 46 | engine_string = "postgresql+psycopg2://%s:%s@%s:%d/%s" \ 47 | % (db_user, db_password, db_endpoint, 9541, 'farcaster') 48 | 49 | postgres_engine = create_engine(engine_string, connect_args={"connect_timeout": 1000}) 50 | with postgres_engine.begin() as conn: 51 | conn.execute(text("TRUNCATE TABLE automod_data")) 52 | df_automod.to_sql('automod_data', con=conn, if_exists='append', index=False) 53 | return None 54 | 55 | 56 | if __name__ == "__main__": 57 | # Get the parameters from the command line arguments 58 | if len(sys.argv) != 5: 59 | raise ValueError("Please provide db_user, db_password, and db_endpoint as arguments.") 60 | 61 | api_key = sys.argv[1] 62 | db_user = sys.argv[2] 63 | db_password = sys.argv[3] 64 | db_endpoint = sys.argv[4] 65 | 66 | fetch_data_from_api(api_key, db_user, db_password, db_endpoint) -------------------------------------------------------------------------------- /pipeline/extractors/extract_channel_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | while getopts w:v:c:s:d flag 4 | do 5 | case "${flag}" in 6 | w) WORK_DIR=${OPTARG};; 7 | v) VENV=${OPTARG};; 8 | esac 9 | done 10 | 11 | if [ -z "$VENV" ] || [ -z "$WORK_DIR" ]; then 12 | echo "Usage: $0 -w [work_dir] -v [venv] " 13 | echo "" 14 | echo "Example: $0 -w . -v /home/ubuntu/farcaster-graph/publisher/.venv " 15 | echo "" 16 | echo "Params:" 17 | echo " [work_dir] The working directory to read .env file and execute scripts from." 18 | echo " [venv] The path where a python3 virtualenv has been created." 19 | echo "" 20 | exit 21 | fi 22 | 23 | # Setup environment variables 24 | echo "Setting up environment variables" 25 | source $WORK_DIR/.env 26 | 27 | # Activate 28 | echo "Activating Python 3.12 environment" 29 | source $VENV/bin/activate 30 | 31 | # Install 32 | echo "Installing requirements" 33 | #pip install -r requirements.txt 34 | 35 | # Run 36 | echo "Running channel data import" 37 | /usr/bin/env python3 -m extractors.main_channel_data 38 | 39 | if [ $? -ne 0 ]; then 40 | echo "Failed to run script" 41 | exit 1 42 | fi 43 | 44 | # Teardown 45 | echo "Deactivating Python 3.12 environment" 46 | deactivate 47 | -------------------------------------------------------------------------------- /pipeline/extractors/extract_cura_mod.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | while getopts w:v:rd flag 4 | do 5 | case "${flag}" in 6 | w) WORK_DIR=${OPTARG};; 7 | v) VENV=${OPTARG};; 8 | r) RUN_FLAG="--run";; 9 | d) DRYRUN_FLAG="--dry-run";; 10 | esac 11 | done 12 | 13 | if [ -z "$VENV" ] || [ -z "$WORK_DIR" ] || [ -z "$RUN_FLAG" ]; then 14 | echo "Usage: $0 -w [work_dir] -v [venv] -r -d " 15 | echo "" 16 | echo "Example: $0 -w . -v /home/ubuntu/farcaster-graph/publisher/.venv -r" 17 | echo "Example: $0 -w . 
-v /home/ubuntu/farcaster-graph/publisher/.venv -r -d" 18 | echo "" 19 | echo "Params:" 20 | echo " [work_dir] The working directory to read .env file and execute scripts from." 21 | echo " [venv] The path where a python3 virtualenv has been created." 22 | echo " [run] Flag to run the script." 23 | echo " [dryrun] Flag to run the script in dry-run mode." 24 | echo "" 25 | exit 26 | fi 27 | 28 | set -e 29 | set -o pipefail 30 | 31 | # Setup environment variables 32 | echo "Setting up environment variables" 33 | source $WORK_DIR/.env 34 | 35 | # Activate 36 | echo "Activating Python 3.12 environment" 37 | source $VENV/bin/activate 38 | 39 | # Install 40 | echo "Installing requirements" 41 | #pip install -r requirements.txt 42 | 43 | # Run 44 | echo "Running cura channel mod data extractor with flags" 45 | /usr/bin/env python3 -m extractors.cura_mod_extractor $RUN_FLAG $DRYRUN_FLAG 46 | 47 | if [ $? -ne 0 ]; then 48 | echo "Failed to run script" 49 | exit 1 50 | fi 51 | 52 | # Teardown 53 | echo "Deactivating Python 3.12 environment" 54 | deactivate 55 | -------------------------------------------------------------------------------- /pipeline/extractors/main_channel_data.py: -------------------------------------------------------------------------------- 1 | from config import settings 2 | import utils 3 | 4 | import requests 5 | import pandas as pd 6 | from sqlalchemy import create_engine 7 | from sqlalchemy import text 8 | from loguru import logger 9 | 10 | 11 | def fetch_data_from_api(): 12 | initial_url = "https://api.warpcast.com/v2/all-channels" 13 | response = requests.get(initial_url) 14 | 15 | df_warpcast_channels = pd.DataFrame(response.json()["result"]["channels"]) 16 | df_warpcast_channels['createdAt'] = pd.to_datetime(df_warpcast_channels['createdAt'], unit='ms') 17 | df_warpcast_channels.columns = df_warpcast_channels.columns.str.lower() 18 | db_column_names = [ 19 | "id", 20 | "url", 21 | "name", 22 | "description", 23 | "imageurl", 24 | "headerimageurl", 25 | "leadfid", 26 | "moderatorfids", 27 | "createdat", 28 | "followercount", 29 | "membercount", 30 | "pinnedcasthash", 31 | ] 32 | df_warpcast_channels = df_warpcast_channels.filter(items=db_column_names, axis=1) 33 | logger.info(utils.df_info_to_string(df_warpcast_channels, with_sample=True)) 34 | 35 | if len(df_warpcast_channels) == 0: 36 | raise Exception("Failed to fetch data from warpcast. 
No data found.") 37 | 38 | postgres_engine = create_engine(settings.POSTGRES_URL.get_secret_value(), connect_args={"connect_timeout": 1000}) 39 | try: 40 | with postgres_engine.begin() as conn: 41 | conn.execute(text("TRUNCATE TABLE warpcast_channels_data")) 42 | df_warpcast_channels.to_sql('warpcast_channels_data', con=conn, if_exists='append', index=False) 43 | except Exception as e: 44 | logger.error(f"Failed to insert data into postgres: {e}") 45 | raise e 46 | 47 | 48 | if __name__ == "__main__": 49 | fetch_data_from_api() 50 | -------------------------------------------------------------------------------- /pipeline/frames/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/pipeline/frames/__init__.py -------------------------------------------------------------------------------- /pipeline/frames/frames_db_utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from timer import Timer 4 | 5 | import psycopg2 6 | import psycopg2.extras 7 | 8 | 9 | @Timer(name="fetch_unprocessed_urls") 10 | def fetch_unprocessed_urls(logger: logging.Logger, pg_dsn: str, limit: int) -> list[tuple]: 11 | """return will be of the form [(url_id, url)]""" 12 | fetch_sql = f""" 13 | SELECT url_id, url 14 | FROM k3l_url_labels 15 | WHERE processed_ts IS NULL 16 | ORDER BY earliest_cast_dt ASC 17 | LIMIT {limit} 18 | """ 19 | with psycopg2.connect(pg_dsn) as conn: 20 | with conn.cursor() as cursor: 21 | logger.info(f"Executing: {fetch_sql}") 22 | cursor.execute(fetch_sql) 23 | url_records = cursor.fetchall() 24 | return url_records 25 | 26 | @Timer(name="update_url_categories") 27 | def update_url_categories(logger: logging.Logger, pg_dsn: str, url_categories: list[tuple]): 28 | """url_categories should be of the form [(url_id, category)]""" 29 | update_sql = """ 30 | UPDATE k3l_url_labels as k 31 | SET processed_ts=now(), category=v.cat 32 | FROM (VALUES %s) AS v(id, cat) 33 | WHERE url_id=v.id; 34 | """ 35 | with psycopg2.connect(pg_dsn) as conn: 36 | with conn.cursor() as cursor: 37 | logger.info(f"Executing: {update_sql}") 38 | psycopg2.extras.execute_values(cursor, 39 | update_sql, 40 | url_categories, 41 | template=None, 42 | page_size=100) 43 | 44 | @Timer(name="fetch_unparsed_urls") 45 | def fetch_unparsed_urls(logger: logging.Logger, pg_dsn: str, limit: int) -> list[tuple]: 46 | """return will be of the form [(url_id, url)]""" 47 | fetch_sql = f""" 48 | SELECT url_id, url 49 | FROM k3l_url_labels 50 | WHERE parsed_ts IS NULL 51 | ORDER BY earliest_cast_dt ASC 52 | LIMIT {limit} 53 | """ 54 | with psycopg2.connect(pg_dsn) as conn: 55 | with conn.cursor() as cursor: 56 | logger.info(f"Executing: {fetch_sql}") 57 | cursor.execute(fetch_sql) 58 | url_records = cursor.fetchall() 59 | return url_records 60 | 61 | @Timer(name="update_url_parts") 62 | def update_url_parts(logger: logging.Logger, pg_dsn: str, url_parts: list[tuple]): 63 | """url_parts should be of the form [(url_id, scheme, domain, subdomain, tld, path)]""" 64 | update_sql = f""" 65 | UPDATE k3l_url_labels as k 66 | SET parsed_ts=now(), scheme=v.scheme, domain=v.domain, subdomain=v.subdomain, tld=v.tld, path=v.path 67 | FROM (VALUES %s) AS v(id, scheme, domain, subdomain, tld, path) 68 | WHERE url_id=v.id; 69 | """ 70 | with psycopg2.connect(pg_dsn) as conn: 71 | with conn.cursor() as cursor: 72 | logger.info(f"Executing: {update_sql}") 73 
| psycopg2.extras.execute_values(cursor, 74 | update_sql, 75 | url_parts, 76 | template=None, 77 | page_size=100) 78 | 79 | -------------------------------------------------------------------------------- /pipeline/frames/incremental_load_cast_mapping.sql: -------------------------------------------------------------------------------- 1 | INSERT INTO k3l_cast_embed_url_mapping(url_id, cast_id) 2 | WITH max_cast_dt AS ( 3 | select 4 | max(latest_cast_dt) as dt 5 | from k3l_url_labels as labels 6 | inner join k3l_cast_embed_url_mapping as url_map on (labels.url_id = url_map.url_id) 7 | ) 8 | SELECT 9 | labels.url_id as url_id, 10 | casts.id as cast_id 11 | FROM casts 12 | cross join lateral jsonb_array_elements(casts.embeds) as ems 13 | inner join max_cast_dt on (casts.created_at >= max_cast_dt.dt AND casts.deleted_at IS NULL) 14 | inner join 15 | k3l_url_labels as labels 16 | on (labels.url = ems->>'url' 17 | AND jsonb_array_length(embeds) > 0 18 | AND ems->'url' IS NOT NULL 19 | AND ems->>'url' NOT LIKE ALL(ARRAY[ 20 | 'https://i.imgur.com/%', 21 | 'https://youtu.be/%', 22 | 'https://www.youtube.com/%', 23 | 'https://imagedelivery.net/%', 24 | '%.png', '%.gif', '%.pdf', '%.jpg', '%.jpeg', '%.mp4', '%.m3u8']) 25 | AND created_at >= max_cast_dt.dt 26 | ) -------------------------------------------------------------------------------- /pipeline/frames/incremental_load_labels.sql: -------------------------------------------------------------------------------- 1 | INSERT INTO k3l_url_labels(url, latest_cast_dt, earliest_cast_dt) 2 | WITH max_cast_dt AS ( 3 | select 4 | max(latest_cast_dt) as dt 5 | from k3l_url_labels as labels 6 | inner join k3l_cast_embed_url_mapping as url_map on (labels.url_id = url_map.url_id) 7 | ) 8 | SELECT 9 | ems->>'url' as url, 10 | max(created_at) as latest_cast_dt, 11 | min(created_at) as earliest_cast_dt 12 | FROM 13 | casts 14 | cross join lateral jsonb_array_elements(casts.embeds) as ems 15 | inner join max_cast_dt on (casts.created_at >= max_cast_dt.dt AND casts.deleted_at IS NULL) 16 | left join 17 | k3l_url_labels as labels 18 | on (labels.url = ems->>'url' 19 | and casts.created_at >= max_cast_dt.dt 20 | ) 21 | WHERE 22 | labels.url_id IS NULL 23 | AND jsonb_array_length(embeds) > 0 24 | AND ems->'url' IS NOT NULL 25 | AND ems->>'url' NOT LIKE ALL(ARRAY[ 26 | 'https://i.imgur.com/%', 27 | 'https://youtu.be/%', 28 | 'https://www.youtube.com/%', 29 | 'https://imagedelivery.net/%', 30 | '%.png', '%.gif', '%.pdf', '%.jpg', '%.jpeg', '%.mp4', '%.m3u8']) 31 | GROUP BY ems->>'url' -------------------------------------------------------------------------------- /pipeline/frames/scrape_utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from enum import Enum 3 | from typing import NamedTuple 4 | import asyncio 5 | from urllib.parse import urlparse 6 | 7 | import tldextract 8 | from bs4 import BeautifulSoup 9 | import aiohttp as aiohttp 10 | 11 | class URLCategory(Enum): 12 | FRAME = 'frame' 13 | TIMEOUT = 'timeout' 14 | BAD = 'bad' 15 | UNKNOWN = 'unknown' 16 | ERROR = 'error' 17 | 18 | async def categorize_url( 19 | logger: logging.Logger, 20 | url_id: int, url:str, 21 | session: aiohttp.ClientSession, 22 | timeout: aiohttp.ClientTimeout 23 | ) -> tuple[int, str]: 24 | logger.debug(f"Fetching {url_id} - {url}") 25 | try: 26 | if urlparse(url).scheme not in ['http','https']: 27 | logger.error(f"bad url {url_id} - {url}") 28 | return (url_id, URLCategory.BAD.value) 29 | async with session.get(url, 
timeout=timeout) as resp: 30 | body = await resp.text() 31 | soup = BeautifulSoup(body, 'html.parser') 32 | frame_meta = soup.find('meta', attrs={"property":"fc:frame"}) 33 | return (url_id, URLCategory.FRAME.value) if frame_meta \ 34 | else (url_id, URLCategory.UNKNOWN.value) 35 | except asyncio.TimeoutError as e: 36 | logger.error(f"{url_id} - {url} timed out: {e}") 37 | return (url_id, URLCategory.TIMEOUT.value) 38 | except aiohttp.InvalidURL as e: 39 | logger.error(f"bad url {url_id} - {url}: {e}") 40 | return (url_id, URLCategory.BAD.value) 41 | except aiohttp.ClientError as e: 42 | logger.error(f"error {url_id} - {url}: {e}") 43 | return (url_id, URLCategory.ERROR.value) 44 | except aiohttp.ClientError as e: 45 | logger.error(f"error {url_id} - {url}: {e}") 46 | return (url_id, URLCategory.ERROR.value) 47 | except ValueError as e: 48 | logger.error(f"error {url_id} - {url}: {e}") 49 | return (url_id, URLCategory.ERROR.value) 50 | except Exception as e: 51 | logger.error(f"error {url_id} - {url}: {e}") 52 | return (url_id, URLCategory.ERROR.value) 53 | 54 | class URL_parts(NamedTuple): 55 | url_id: int 56 | scheme: str 57 | domain: str 58 | subdomain: str 59 | tld: str 60 | path: str 61 | 62 | def parse_url( 63 | logger: logging.Logger, 64 | url_id: int, 65 | url:str 66 | ) -> tuple[int, str, str, str, str, str]: 67 | logger.debug(f"parsing {url_id} - {url}") 68 | try: 69 | parse_result = urlparse(url) 70 | extract = tldextract.extract(url) 71 | path = parse_result.path 72 | if path.endswith(':'): 73 | path = path[:-1] 74 | return tuple(URL_parts(url_id, 75 | parse_result.scheme, 76 | extract.domain, 77 | extract.subdomain, 78 | extract.suffix, 79 | path)) 80 | except Exception as e: 81 | logger.error(f"error {url_id} - {url}: {e}") 82 | return (url_id, '', '', '', '', '') -------------------------------------------------------------------------------- /pipeline/frames/test_urls.py: -------------------------------------------------------------------------------- 1 | # standard dependencies 2 | import sys 3 | 4 | # local dependencies 5 | from config import settings 6 | from . import scrape_utils 7 | 8 | # 3rd party dependencies 9 | from dotenv import load_dotenv 10 | from loguru import logger 11 | 12 | logger.remove() 13 | level_per_module = { 14 | "": settings.LOG_LEVEL, 15 | "silentlib": False 16 | } 17 | logger.add(sys.stdout, 18 | colorize=True, 19 | format=settings.LOGURU_FORMAT, 20 | filter=level_per_module, 21 | level=0) 22 | 23 | def test(): 24 | url = 'https://apis.cast.k3l.io' 25 | url_category = scrape_utils.categorize_url(logger, -1, url, timeout=1) 26 | logger.debug(f"{url} category ? {url_category}") 27 | 28 | url = 'https://cast.k3l.io/apis123' 29 | url_category = scrape_utils.categorize_url(logger, -1, url, timeout=1) 30 | logger.debug(f"{url} category ? {url_category}") 31 | 32 | url = 'https://cast.k3l.io' 33 | url_category = scrape_utils.categorize_url(logger, -1, url, timeout=1) 34 | logger.debug(f"{url} category ? {url_category}") 35 | 36 | url = 'https://dune-frames.vercel.app/api' 37 | url_category = scrape_utils.categorize_url(logger, -1, url, settings.FRAMES_SCRAPE_TIMEOUT_SECS) 38 | logger.debug(f"{url} category ? {url_category}") 39 | 40 | url = 'https://www.youtube.com' 41 | url_category = scrape_utils.categorize_url(logger, -1, url, settings.FRAMES_SCRAPE_TIMEOUT_SECS) 42 | logger.debug(f"{url} category ? 
{url_category}") 43 | 44 | url = 'https://www.youttube.com' 45 | url_category = scrape_utils.categorize_url(logger, -1, url, settings.FRAMES_SCRAPE_TIMEOUT_SECS) 46 | logger.debug(f"{url} category ? {url_category}") 47 | 48 | url = 'abc' 49 | url_category = scrape_utils.categorize_url(logger, -1, url, settings.FRAMES_SCRAPE_TIMEOUT_SECS) 50 | logger.debug(f"{url} category ? {url_category}") 51 | 52 | url = 'http://1' 53 | url_category = scrape_utils.categorize_url(logger, -1, url, settings.FRAMES_SCRAPE_TIMEOUT_SECS) 54 | logger.debug(f"{url} category ? {url_category}") 55 | 56 | 57 | if __name__ == "__main__": 58 | load_dotenv() 59 | print(settings) 60 | 61 | logger.debug('####### TODO use pytest ########') 62 | test() 63 | -------------------------------------------------------------------------------- /pipeline/globaltrust/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/pipeline/globaltrust/__init__.py -------------------------------------------------------------------------------- /pipeline/globaltrust/export_localtrust_daily_stats.sql: -------------------------------------------------------------------------------- 1 | with stats_per_strategy_per_date as (SELECT 2 | max(date) AS date, 3 | COUNT(CASE WHEN strategy_id = 1 THEN 1 END) AS strategy_id_1_row_count, 4 | AVG(CASE WHEN strategy_id = 1 THEN v END) AS strategy_id_1_mean, 5 | STDDEV(CASE WHEN strategy_id = 1 THEN v END) AS strategy_id_1_stddev, 6 | MAX(CASE WHEN strategy_id = 1 THEN v END) - MIN(CASE WHEN strategy_id = 1 THEN v END) AS strategy_id_1_range, 7 | COUNT(CASE WHEN strategy_id = 3 THEN 1 END) AS strategy_id_3_row_count, 8 | AVG(CASE WHEN strategy_id = 3 THEN v END) AS strategy_id_3_mean, 9 | STDDEV(CASE WHEN strategy_id = 3 THEN v END) AS strategy_id_3_stddev, 10 | MAX(CASE WHEN strategy_id = 3 THEN v END) - MIN(CASE WHEN strategy_id = 3 THEN v END) AS strategy_id_3_range 11 | FROM 12 | localtrust 13 | -- GROUP BY 14 | -- date 15 | ) 16 | 17 | INSERT INTO localtrust_stats ( 18 | date, 19 | strategy_id_1_row_count, 20 | strategy_id_1_mean, 21 | strategy_id_1_stddev, 22 | strategy_id_1_range, 23 | strategy_id_3_row_count, 24 | strategy_id_3_mean, 25 | strategy_id_3_stddev, 26 | strategy_id_3_range 27 | ) 28 | SELECT 29 | date, 30 | strategy_id_1_row_count, 31 | strategy_id_1_mean, 32 | strategy_id_1_stddev, 33 | strategy_id_1_range, 34 | strategy_id_3_row_count, 35 | strategy_id_3_mean, 36 | strategy_id_3_stddev, 37 | strategy_id_3_range 38 | FROM 39 | stats_per_strategy_per_date; 40 | -------------------------------------------------------------------------------- /pipeline/globaltrust/queries.py: -------------------------------------------------------------------------------- 1 | from db_utils import SQL 2 | 3 | class IJVSql: 4 | LIKES = SQL("LIKES", """ 5 | SELECT reactions.fid as i, reactions.target_fid as j, count(1) as likes_v 6 | FROM reactions 7 | INNER JOIN fids ON fids.fid = reactions.target_fid 8 | WHERE reaction_type=1 9 | AND reactions.target_fid IS NOT NULL 10 | {condition} 11 | GROUP BY i, j 12 | """) 13 | REPLIES = SQL("REPLIES", """ 14 | SELECT fid as i, parent_fid as j, count(1) as replies_v 15 | FROM casts 16 | WHERE parent_hash IS NOT NULL 17 | {condition} 18 | GROUP by i, j 19 | """) 20 | MENTIONS = SQL("MENTIONS", """ 21 | WITH mention AS ( 22 | SELECT fid as author_fid, mention as mention_fid, timestamp 23 | FROM casts, unnest(casts.mentions) as mention 
24 | ) 25 | SELECT 26 | author_fid as i, mention_fid as j, count(1) as mentions_v 27 | FROM mention 28 | INNER JOIN fids ON fids.fid = mention.mention_fid 29 | {condition} 30 | GROUP BY i, j 31 | """) 32 | RECASTS = SQL("RECASTS", """ 33 | SELECT reactions.fid as i, reactions.target_fid as j, count(1) as recasts_v 34 | FROM reactions 35 | INNER JOIN fids ON fids.fid = reactions.target_fid 36 | WHERE reaction_type=2 37 | AND reactions.target_fid IS NOT NULL 38 | {condition} 39 | GROUP BY i, j 40 | """) 41 | FOLLOWS = SQL("FOLLOWS", """ 42 | SELECT 43 | links.fid as i, 44 | links.target_fid as j, 45 | 1 as follows_v 46 | FROM links 47 | INNER JOIN fids ON fids.fid = links.target_fid 48 | WHERE type = 'follow'::text 49 | {condition} 50 | ORDER BY i, j, follows_v desc 51 | """) 52 | 53 | class IVSql: 54 | PRETRUST_TOP_TIER = SQL("PRETRUST_TOP_TIER", """ 55 | WITH pt_size AS ( 56 | select count(*) as ct from pretrust_v2 57 | where insert_ts=(select max(insert_ts) from pretrust_v2 where strategy_id = {strategy}) 58 | and strategy_id = {strategy} 59 | ) 60 | SELECT fid as i, 1/ct::numeric as v 61 | FROM pretrust_v2, pt_size 62 | WHERE insert_ts=(select max(insert_ts) from pretrust_v2 where strategy_id = {strategy}) 63 | AND strategy_id = {strategy} 64 | """) 65 | PRETRUST_POPULAR = SQL("PRETRUST_POPULAR", """ 66 | SELECT 67 | c.fid AS i, 68 | 1/20::numeric as v 69 | FROM 70 | reactions r 71 | INNER JOIN casts c ON c.hash = r.target_cast_hash 72 | INNER JOIN user_data u ON c.fid = u.fid AND u.type = 6 73 | WHERE 74 | r.created_at >= current_timestamp - interval '7' day 75 | GROUP BY 76 | c.fid 77 | ORDER BY 78 | COUNT(*) DESC 79 | LIMIT 20 80 | """) 81 | PRETRUST_OG = SQL("PRETRUST_OG", """ 82 | SELECT 83 | distinct fid as i, 84 | 1/11::numeric as v 85 | FROM user_data 86 | WHERE 87 | value in ('dwr.eth', 'varunsrin.eth', 'balajis.eth', 88 | 'vitalik.eth','ccarella.eth','tim', 89 | 'lesgreys.eth','linda','ace', 90 | 'vm','cdixon.eth') 91 | AND type=6 92 | """) -------------------------------------------------------------------------------- /pipeline/globaltrust/test_data.py: -------------------------------------------------------------------------------- 1 | # standard dependencies 2 | import logging 3 | 4 | # local dependencies 5 | import utils 6 | from config import settings 7 | from . 
import compute 8 | from .queries import IJVSql 9 | 10 | # 3rd party dependencies 11 | from dotenv import load_dotenv 12 | import pandas as pd 13 | 14 | if __name__ == '__main__': 15 | load_dotenv() 16 | print(settings) 17 | 18 | logger = logging.getLogger() 19 | utils.setup_filelogger(logger, __file__) 20 | logger.setLevel(logging.DEBUG) 21 | utils.setup_consolelogger(logger) 22 | 23 | pg_dsn = settings.ALT_POSTGRES_DSN.get_secret_value() 24 | 25 | df = compute._fetch_interactions_df(logger, pg_dsn) 26 | logger.info(utils.df_info_to_string(df, with_sample=True)) 27 | 28 | pkl_file = '/tmp/fc_interactions_df.pkl' 29 | logger.info(f"Pickling interactions dataframe to {pkl_file}") 30 | df.to_pickle(pkl_file) 31 | logger.info(f"Done pickling interactions dataframe to {pkl_file}") 32 | 33 | num_ij_pairs = df[df['follows_v'].notna()].groupby(['i', 'j']).ngroups 34 | logger.info(f"Unique i,j follow pairs: {num_ij_pairs}") 35 | 36 | num_selfies = len(df[df['i']==df['j']]) 37 | logger.info(f"Number of self followers: {num_selfies}") 38 | -------------------------------------------------------------------------------- /pipeline/graph/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/pipeline/graph/__init__.py -------------------------------------------------------------------------------- /pipeline/graph/export_existingConnections_addr.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | '0x'||encode(coalesce(v1.signer_address, f1.custody_address),'hex') as i, 3 | '0x'||encode(coalesce(v2.signer_address, f2.custody_address),'hex') as j, 4 | lt.v 5 | FROM localtrust as lt 6 | INNER JOIN fids as f1 on (f1.fid = cast(lt.i as int8)) 7 | INNER JOIN fids as f2 on (f2.fid = cast(lt.j as int8)) 8 | LEFT JOIN verifications as v1 on (v1.fid = f1.fid) 9 | LEFT JOIN verifications as v2 on (v2.fid = f2.fid) 10 | WHERE 11 | lt.strategy_id=1 12 | AND lt.date=(select max(date) from localtrust where strategy_id=1) 13 | 14 | -------------------------------------------------------------------------------- /pipeline/graph/export_existingConnections_fid.sql: -------------------------------------------------------------------------------- 1 | select 2 | i, 3 | j, 4 | v 5 | from 6 | localtrust 7 | where 8 | strategy_id=1 9 | and date=(select max(date) from localtrust where strategy_id=1) 10 | -- comment out below code for local testing 11 | -- AND i::integer < 10 12 | -- ORDER BY random() 13 | -- LIMIT 1000 -------------------------------------------------------------------------------- /pipeline/graph/export_l1rep6rec3m12enhancedConnections_addr.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | '0x'||encode(coalesce(v1.signer_address, f1.custody_address),'hex') as i, 3 | '0x'||encode(coalesce(v2.signer_address, f2.custody_address),'hex') as j, 4 | lt.v 5 | FROM localtrust as lt 6 | INNER JOIN fids as f1 on (f1.fid = cast(lt.i as int8)) 7 | INNER JOIN fids as f2 on (f2.fid = cast(lt.j as int8)) 8 | LEFT JOIN verifications as v1 on (v1.fid = f1.fid) 9 | LEFT JOIN verifications as v2 on (v2.fid = f2.fid) 10 | WHERE 11 | lt.strategy_id=3 12 | AND lt.date=(select max(date) from localtrust where strategy_id=3) -------------------------------------------------------------------------------- /pipeline/graph/export_l1rep6rec3m12enhancedConnections_fid.sql: 
-------------------------------------------------------------------------------- 1 | select 2 | i, 3 | j, 4 | v 5 | from 6 | localtrust 7 | where 8 | strategy_id=3 9 | and date=(select max(date) from localtrust where strategy_id=3) 10 | -- comment out below code for local testing 11 | -- AND i::integer < 10 12 | -- ORDER BY random() 13 | -- LIMIT 1000 -------------------------------------------------------------------------------- /pipeline/graph/rechunk_graph_pqt.py: -------------------------------------------------------------------------------- 1 | # standard dependencies 2 | from pathlib import Path 3 | import argparse 4 | import sys 5 | import os 6 | 7 | # local dependencies 8 | 9 | # 3rd party dependencies 10 | from loguru import logger 11 | import polars as pl 12 | 13 | def main(indir: Path, outfile: Path): 14 | 15 | logger.info(f"reading parquet files {indir}/*.pqt") 16 | pq_files = [os.path.join(indir, f) for f in os.listdir(indir) if f.endswith('.pqt')] 17 | if not pq_files: 18 | raise FileNotFoundError(f"No parquet files found in {indir}") 19 | 20 | # Read all parquet files into a list of DataFrames 21 | dfs = [] 22 | for file in pq_files: 23 | try: 24 | df = pl.read_parquet(file, rechunk=True, low_memory=False) 25 | dfs.append(df) 26 | logger.debug(f"Successfully read {file}") 27 | except Exception as e: 28 | logger.error(f"Error reading {file}: {e}") 29 | 30 | if not dfs: 31 | raise ValueError("No valid parquet files could be read") 32 | 33 | # Concatenate all DataFrames into a single DataFrame 34 | pq_df = pl.concat(dfs) 35 | 36 | logger.info(f"df estimated_size: {pq_df.estimated_size('mb')}") 37 | logger.info(f"df describe: {pq_df.describe()}") 38 | logger.info(f"df sample: {pq_df.sample(n=min(5, len(pq_df)))}") 39 | 40 | logger.info(f"writing to parquet file {outfile}") 41 | pq_df.write_parquet(outfile, 42 | use_pyarrow=True, 43 | statistics=True, 44 | pyarrow_options={ 45 | "write_statistics": True, 46 | "row_group_size": 100_000}) 47 | 48 | if __name__ == '__main__': 49 | parser = argparse.ArgumentParser() 50 | parser.add_argument("-i", "--indir", 51 | help="input directory with all pqt files", 52 | required=True, 53 | type=lambda f: Path(f).expanduser().resolve()) 54 | parser.add_argument("-o", "--outfile", 55 | help="output filename", 56 | required=True, 57 | type=lambda f: Path(f).expanduser().resolve()) 58 | 59 | args = parser.parse_args() 60 | print(args) 61 | 62 | logger.remove() 63 | logger.add(sys.stderr, level='INFO') 64 | 65 | if os.path.isdir(args.outfile): 66 | logger.error("-o / --outfile should be a file not a directory") 67 | sys.exit(1) 68 | main(args.indir, args.outfile) 69 | -------------------------------------------------------------------------------- /pipeline/igraph-docker-compose.yml: -------------------------------------------------------------------------------- 1 | services: 2 | igraph: 3 | build: 4 | dockerfile: igraph.Dockerfile 5 | deploy: 6 | mode: replicated 7 | replicas: 2 8 | image: igraph:latest 9 | volumes: 10 | - /home/ubuntu/serve_files:/home/ubuntu/serve_files:z 11 | expose: 12 | - '8000' 13 | restart: "on-failure" 14 | networks: 15 | - farcaster-network 16 | nginx: 17 | image: nginx:latest 18 | volumes: 19 | - ./igraph.nginx.conf:/etc/nginx/nginx.conf:ro 20 | depends_on: 21 | - igraph 22 | ports: 23 | - "4000:4000" 24 | networks: 25 | - farcaster-network 26 | 27 | networks: 28 | farcaster-network: 29 | name: farcaster-network 30 | external: true 31 | -------------------------------------------------------------------------------- 
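The compose file above fronts two replicated igraph containers with nginx so that only port 4000 is exposed on the shared farcaster-network. A minimal local smoke test is sketched below; it assumes Docker Compose v2 is installed, that the `/home/ubuntu/serve_files` volume path exists on the host, and the probe path `/` is purely illustrative since the actual routes are defined by `graph.serve_igraph:app` rather than shown here.
```
# the compose file declares farcaster-network as external, so create it once
docker network create farcaster-network || true

# build the igraph image and start the replicated service plus nginx
docker compose -f igraph-docker-compose.yml up -d --build

# nginx listens on host port 4000 and proxies to the igraph upstream on 8000;
# replace "/" with whichever route graph.serve_igraph:app actually serves
curl -i http://localhost:4000/
```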
/pipeline/igraph.Dockerfile: -------------------------------------------------------------------------------- 1 | # FROM python:3.12-alpine 2 | # not taking the alpine route because packages like psutil don't install without gcc 3 | FROM python:3.12-slim 4 | 5 | RUN pip install --upgrade pip 6 | 7 | WORKDIR /server 8 | 9 | # don't copy code yet otherwise docker layers will get invalidated every code push 10 | COPY ./requirements.txt /server 11 | 12 | RUN python -m ensurepip --upgrade 13 | RUN python -m pip install --no-cache-dir --upgrade -r requirements.txt 14 | 15 | # copy rest of the code 16 | COPY . /server 17 | 18 | CMD ["uvicorn", "graph.serve_igraph:app", "--host", "0.0.0.0", "--port", "8000", "--timeout-keep-alive", "300"] -------------------------------------------------------------------------------- /pipeline/igraph.nginx.conf: -------------------------------------------------------------------------------- 1 | user nginx; 2 | worker_processes auto; 3 | worker_rlimit_nofile 30000; 4 | 5 | events { 6 | worker_connections 4096; 7 | } 8 | 9 | http { 10 | keepalive_timeout 65; 11 | keepalive_requests 100000; 12 | tcp_nopush on; 13 | tcp_nodelay on; 14 | 15 | upstream igraph_servers { 16 | server igraph:8000; 17 | } 18 | 19 | server { 20 | listen 4000; 21 | 22 | location / { 23 | proxy_pass http://igraph_servers; 24 | proxy_connect_timeout 300s; 25 | proxy_send_timeout 300s; 26 | proxy_read_timeout 300s; 27 | } 28 | } 29 | } -------------------------------------------------------------------------------- /pipeline/logs/.placeholder: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/pipeline/logs/.placeholder -------------------------------------------------------------------------------- /pipeline/plugins/.placeholder: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/pipeline/plugins/.placeholder -------------------------------------------------------------------------------- /pipeline/plugins/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/pipeline/plugins/__init__.py -------------------------------------------------------------------------------- /pipeline/plugins/hooks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/pipeline/plugins/hooks/__init__.py -------------------------------------------------------------------------------- /pipeline/plugins/hooks/common.py: -------------------------------------------------------------------------------- 1 | from urllib.parse import urlparse, urlunparse 2 | from airflow.models import Variable 3 | 4 | def convert_hostname(url: str): 5 | # Parse the original URL 6 | parsed_url = urlparse(url) 7 | 8 | # Replace the scheme and netloc with the new hostname 9 | new_netloc = Variable.get("airflow_hostname") 10 | new_scheme = "https" 11 | 12 | # Construct the new URL 13 | return urlunparse((new_scheme, new_netloc) + parsed_url[2:]) -------------------------------------------------------------------------------- /pipeline/plugins/hooks/discord.py: 
-------------------------------------------------------------------------------- 1 | # copied from https://medium.com/@artur.aacs/airflow-send-alerts-with-discord-69f343dfa8dd 2 | import re 3 | from typing import Optional 4 | from datetime import datetime 5 | 6 | from airflow.models import Variable, TaskInstance 7 | from discord_webhook import DiscordWebhook, DiscordEmbed 8 | from hooks.common import convert_hostname 9 | 10 | TI = TaskInstance 11 | 12 | def send_alert_discord(context): 13 | # Get Task Instances variables 14 | last_task: Optional[TaskInstance] = context.get('task_instance') 15 | task_name = last_task.task_id 16 | dag_name = last_task.dag_id 17 | log_link = convert_hostname(last_task.log_url) 18 | execution_date = datetime.fromisoformat(str(context.get('execution_date'))) 19 | 20 | # Extract reason for the exception 21 | # try: 22 | # error_message = str(context["exception"]) 23 | # error_message = error_message[:1000] + (error_message[1000:] and '...') 24 | # str_start = re.escape("{'reason': ") 25 | # str_end = re.escape('"}.') 26 | # error_message = re.search('%s(.*)%s' % (str_start, str_end), error_message).group(1) 27 | # error_message = "{'reason': " + error_message + ',}' 28 | # except: 29 | # error_message = "Some error that cannot be extracted has occurred. Visit the logs!" 30 | 31 | print('Sending discord alert') 32 | 33 | # Send Alert 34 | webhook = DiscordWebhook(url=Variable.get("discord_webhook")) # Update variable name with your change 35 | print('execution_date', execution_date) 36 | embed = DiscordEmbed(title="Airflow Alert - Task has failed!", color='CC0000', url=log_link, timestamp=execution_date) 37 | embed.add_embed_field(name="DAG", value=dag_name, inline=True) 38 | embed.add_embed_field(name="PRIORITY", value="HIGH", inline=True) 39 | embed.add_embed_field(name="TASK", value=task_name, inline=False) 40 | embed.add_embed_field(name="ERROR", value=str(context["exception"])) 41 | webhook.add_embed(embed) 42 | response = webhook.execute() 43 | 44 | return response -------------------------------------------------------------------------------- /pipeline/plugins/hooks/pagerduty.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from airflow.providers.pagerduty.notifications.pagerduty import send_pagerduty_notification 4 | from airflow.providers.pagerduty.hooks.pagerduty_events import PagerdutyEventsHook 5 | from airflow.providers.pagerduty.hooks.pagerduty import PagerdutyHook 6 | 7 | from hooks.common import convert_hostname 8 | from airflow.models import Variable, TaskInstance 9 | 10 | # refer to https://github.com/astronomer/pagerduty_airflow_integration_benefits/blob/main/README.md 11 | def send_alert_pagerduty(context): 12 | # Get Task Instances variables 13 | last_task: Optional[TaskInstance] = context.get('task_instance') 14 | log_link = convert_hostname(last_task.log_url) 15 | print('log_link', log_link) 16 | 17 | task_id = last_task.task_id 18 | dag_id = last_task.dag_id 19 | # pagerduty_default needs to be saved on Admin->Variable on the console with Pagerduty Events 20 | integration_key=Variable.get("pagerduty_default") 21 | 22 | print('Sending pagerduty alert') 23 | return PagerdutyEventsHook(integration_key).send_event( 24 | summary=f"Airflow Alert - {dag_id}-{task_id} failed", 25 | severity="critical", 26 | source=f"airflow dag_id: {dag_id}", 27 | dedup_key=f"{dag_id}-{task_id}", 28 | group=f"{dag_id}", 29 | component="airflow", 30 | class_type="Prod Data Pipeline", 31 | 
custom_details=str(context["exception"]), 32 | links=[{ 33 | 'href': log_link, 34 | 'text': 'Link to errored task log' 35 | }], 36 | ) -------------------------------------------------------------------------------- /pipeline/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas==2.1.3 2 | python-dotenv==1.0.0 3 | igraph==0.11.3 4 | psutil==5.9.8 5 | psycopg2-binary==2.9.9 6 | pydantic-settings==2.2.1 7 | sqlalchemy==1.4.52 8 | requests==2.31.0 9 | loguru==0.7.2 10 | beautifulsoup4==4.12.3 11 | aiohttp==3.9.3 12 | tldextract==5.1.1 13 | niquests==3.5.5 14 | polars==0.20.27 15 | pyarrow==16.1.0 16 | fastapi==0.111.0 17 | apache-airflow==2.9.2 18 | dune-client==1.7.4 19 | openrank-sdk==0.2.2 20 | apache-airflow-providers-ssh==3.12.0 21 | asyncpg==0.29.0 22 | tomlkit==0.13.2 -------------------------------------------------------------------------------- /pipeline/run_cast_pipeline.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DT_FORMAT='%Y-%m-%d %H:%M:%S' 4 | 5 | # Function to validate date format 6 | function validate_date() { 7 | date_to_check="$1" 8 | date_format="$2" 9 | 10 | # Check if the date matches the format YYYY-mm-dd 11 | if [[ $(uname) == "Darwin" ]]; then 12 | if ! date -j -f "$date_format" "$date_to_check" >/dev/null 2>&1; then 13 | echo "Invalid date format. Use YYYY-mm-dd." 14 | exit 1 15 | fi 16 | else 17 | if ! date -d "$date_to_check" +"$date_format" >/dev/null 2>&1; then 18 | echo "Invalid date format. Use YYYY-mm-dd." 19 | exit 1 20 | fi 21 | fi 22 | 23 | # Check if the date is in the past 24 | today=$(date +"$date_format") 25 | if [ "$date_to_check" \> "$today" ] || [ "$date_to_check" == "$today" ]; then 26 | echo "The date must be in the past and not include today." 27 | exit 1 28 | fi 29 | } 30 | 31 | while getopts dv:f:t:p:m: flag 32 | do 33 | case "${flag}" in 34 | d) DAEMON_FLAG="--daemon";; 35 | v) VENV=${OPTARG};; 36 | f) FILL_TYPE=${OPTARG};; 37 | t) TARGET_DATE=${OPTARG};; 38 | m) TARGET_MONTH=${OPTARG};; 39 | p) POSTGRES=${OPTARG};; 40 | esac 41 | done 42 | 43 | if [ -z "$VENV" ]; then 44 | echo "Usage: $0 -v [venv] -p [postgres] -d -t [fill_type]" 45 | echo "" 46 | echo "Example: $0 -v /home/ubuntu/venvs/fc-graph-env3/" 47 | echo "Example: $0 -v /home/ubuntu/venvs/fc-graph-env3/ -p eigen8 -d -t backfill" 48 | echo "" 49 | echo "Params:" 50 | echo " [venv] The path where a python3 virtualenv has been created." 51 | echo " [postgres] 'eigen2' or 'eigen8'" 52 | echo " [daemon] Run in daemon mode." 53 | echo " [fill_type] Run in 'default' or 'backfill' or 'gapfill' mode." 54 | echo "" 55 | exit 56 | fi 57 | 58 | if [ ! -z "$POSTGRES" ]; then 59 | PG_OPTION="--postgres $POSTGRES" 60 | fi 61 | 62 | FILL_TYPE=${FILL_TYPE:-default} 63 | 64 | if [ ! -z "$TARGET_DATE" ]; then 65 | validate_date "$TARGET_DATE" "$DT_FORMAT" 66 | DATE_OPTION=(--target-date "$TARGET_DATE") 67 | fi 68 | 69 | # validating TARGET_MONTH in bash is a bit of a pain 70 | # ... let the python script validate it 71 | if [ ! 
-z "$TARGET_MONTH" ]; then 72 | MONTH_OPTION="--target-month $TARGET_MONTH" 73 | fi 74 | 75 | 76 | # set -x 77 | set -e 78 | set -o pipefail 79 | 80 | function log() { 81 | echo "`date` - $1" 82 | } 83 | 84 | source $VENV/bin/activate 85 | # pip install -r requirements.txt 86 | python3 -m casts.main $PG_OPTION $DAEMON_FLAG -f $FILL_TYPE "${DATE_OPTION[@]}" $MONTH_OPTION 87 | deactivate 88 | 89 | log "Done" -------------------------------------------------------------------------------- /pipeline/run_channel_metrics.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | while getopts "w:v:rd" flag 4 | do 5 | case "${flag}" in 6 | w) WORK_DIR=${OPTARG};; 7 | v) VENV=${OPTARG};; 8 | r) RUN_FLAG="--run";; 9 | d) DRYRUN_FLAG="--dry-run";; 10 | esac 11 | done 12 | 13 | if [ -z "$WORK_DIR" ] || [ -z "$VENV" ] || [ -z "$RUN_FLAG" ]; then 14 | echo "Usage: $0 -w [work_dir] -v [venv] -r -d" 15 | echo "" 16 | echo "Example: $0 -w . -v /home/ubuntu/venvs/fc-graph-env3/ -r" 17 | echo "Example: $0 -w . -v /home/ubuntu/venvs/fc-graph-env3/ -r -d" 18 | echo "" 19 | echo "Params:" 20 | echo " [work_dir] The working directory to read .env file and execute scripts from." 21 | echo " [venv] The path where a python3 virtualenv has been created." 22 | echo " [run] Flag to run the script." 23 | echo " [dryrun] Flag to run the script in dry-run mode." 24 | echo "" 25 | exit 26 | fi 27 | 28 | source $WORK_DIR/.env 29 | 30 | # set -x 31 | set -e 32 | set -o pipefail 33 | 34 | function log() { 35 | echo "`date` - $1" 36 | } 37 | 38 | source $VENV/bin/activate 39 | #pip install -r requirements.txt 40 | python3 -m channels.main_metrics $RUN_FLAG $DRYRUN_FLAG 41 | deactivate 42 | -------------------------------------------------------------------------------- /pipeline/run_download_pqt_files_v1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # dayOfYear=`date '+%j'` 4 | # hourOfDay=`date '+%H'` 5 | # hourOfYear="$((dayOfYear * 24 + hourOfDay))" 6 | # echo $dayOfYear $hourOfDay $hourOfYear 7 | # hour_interval=48 8 | 9 | # # TODO use the mtime of the existing parquet file and 10 | # # ..if current time - mtime > 1 hour, start compute 11 | # if [ `expr $hourOfYear % $hour_interval` -eq 0 ]; then 12 | # echo "This is hour $hour_interval. Continuing with script." 13 | # else 14 | # echo "This not hour $hour_interval. Exiting now." 15 | # exit 0 16 | # fi 17 | 18 | 19 | while getopts o:s: flag 20 | do 21 | case "${flag}" in 22 | o) OUT_DIR=${OPTARG};; 23 | s) S3_BKT=${OPTARG};; 24 | esac 25 | done 26 | 27 | if [ -z "$OUT_DIR" ] || [ -z "$S3_BKT" ]; then 28 | echo "Usage: $0 -o [out_dir] -s [s3_bkt]" 29 | echo "" 30 | echo "Example: $0 \ " 31 | echo " -i /home/ubuntu/serve_files/lt_engagement_fid.csv \ " 32 | echo " -w . \ " 33 | echo " -v .venv \ " 34 | echo " -o /tmp/personal-graph/ \ " 35 | echo " -s k3l-openrank-farcaster \ " 36 | echo "" 37 | echo "Params:" 38 | echo " [in_csv] The source file to read dataframe from." 39 | echo " [out_dir] The output directory to write the graph file." 40 | echo " [work_dir] The working directory to read .env file and execute scripts from." 41 | echo " [venv] The path where a python3 virtualenv has been created." 42 | echo " [s3_bkt] The S3 bucket to upload the graph file to." 43 | echo " [task] task to run. choose one: graph_reload, generate, fetch_fids, consolidate" 44 | echo " [fids] comma separated fids to run '1,2,3,420,69'" 45 | echo " [run_id] airflow run id. 
eg) 'manual__2024-07-22T06:46:15.813325+00:00' " 46 | echo " [map_index] airflow map index" 47 | echo "" 48 | exit 49 | fi 50 | 51 | source $WORK_DIR/.env 52 | 53 | set -x 54 | set -e 55 | set -o pipefail 56 | 57 | aws s3 cp s3://${S3_BKT}/personal_graph.parquet $OUT_DIR/personal_graph.parquet -------------------------------------------------------------------------------- /pipeline/run_eigen2_postgres_sql.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | while getopts w: flag 3 | do 4 | case "${flag}" in 5 | w) WORK_DIR=${OPTARG};; 6 | esac 7 | done 8 | 9 | shift $((OPTIND-1)) 10 | SQL_STATEMENT="$1" 11 | 12 | if [ -z "$WORK_DIR" ]; then 13 | echo "Usage: $0 -w [work_dir] [sql_statement]" 14 | echo "" 15 | echo "Example: $0 -w . -c 'REFRESH MATERIALIZED VIEW CONCURRENTLY k3l_rank;'" 16 | echo "" 17 | echo "Params:" 18 | echo " [work_dir] The working directory to read .env file and execute scripts from." 19 | echo " [sql_statement] Optional sql statement to execute." 20 | echo "" 21 | exit 1 22 | fi 23 | 24 | source $WORK_DIR/.env 25 | 26 | DB_HOST=${DB_HOST:-127.0.0.1} 27 | DB_PORT=${DB_PORT:-5432} 28 | DB_USER=${DB_USER:-replicator} 29 | DB_NAME=${DB_NAME:-replicator} 30 | DB_PASSWORD=${DB_PASSWORD:-password} # psql requires PGPASSWORD to be set 31 | 32 | # set -x 33 | set -e 34 | set -o pipefail 35 | 36 | if hash psql 2>/dev/null; then 37 | echo "OK, you have psql in the path. We’ll use that." 38 | PSQL=psql 39 | else 40 | echo "You don't have psql is the path. Let's try /usr/bin" 41 | hash /usr/bin/psql 42 | PSQL=/usr/bin/psql 43 | fi 44 | 45 | PGPASSWORD=$DB_PASSWORD $PSQL -e -h $DB_HOST -p $DB_PORT -U $DB_USER -d $DB_NAME \ 46 | -c "$SQL_STATEMENT" -------------------------------------------------------------------------------- /pipeline/run_eigen8_postgres_sql.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | while getopts w: flag 3 | do 4 | case "${flag}" in 5 | w) WORK_DIR=${OPTARG};; 6 | esac 7 | done 8 | 9 | shift $((OPTIND-1)) 10 | SQL_STATEMENT="$1" 11 | 12 | if [ -z "$WORK_DIR" ]; then 13 | echo "Usage: $0 -w [work_dir] [sql_statement]" 14 | echo "" 15 | echo "Example: $0 -w . -c 'REFRESH MATERIALIZED VIEW CONCURRENTLY k3l_rank;'" 16 | echo "" 17 | echo "Params:" 18 | echo " [work_dir] The working directory to read .env file and execute scripts from." 19 | echo " [sql_statement] Optional sql statement to execute." 20 | echo "" 21 | exit 1 22 | fi 23 | 24 | source $WORK_DIR/.env 25 | 26 | ALT_REMOTE_DB_HOST=${ALT_REMOTE_DB_HOST:-127.0.0.1} 27 | ALT_REMOTE_DB_PORT=${ALT_REMOTE_DB_PORT:-5432} 28 | ALT_REMOTE_DB_USER=${ALT_REMOTE_DB_USER:-k3l_user} 29 | ALT_REMOTE_DB_NAME=${ALT_REMOTE_DB_NAME:-farcaster} 30 | ALT_REMOTE_DB_PASSWORD=${ALT_REMOTE_DB_PASSWORD:-password} # psql requires PGPASSWORD to be set 31 | 32 | # set -x 33 | set -e 34 | set -o pipefail 35 | 36 | if hash psql 2>/dev/null; then 37 | echo "OK, you have psql in the path. We’ll use that." 38 | PSQL=psql 39 | else 40 | echo "You don't have psql is the path. 
Let's try /usr/bin" 41 | hash /usr/bin/psql 42 | PSQL=/usr/bin/psql 43 | fi 44 | 45 | PGPASSWORD=$ALT_REMOTE_DB_PASSWORD $PSQL -e -h $ALT_REMOTE_DB_HOST \ 46 | -p $ALT_REMOTE_DB_PORT -U $ALT_REMOTE_DB_USER -d $ALT_REMOTE_DB_NAME \ 47 | -c "$SQL_STATEMENT" -------------------------------------------------------------------------------- /pipeline/run_fetch_channel_top_caster.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | while getopts w:i:v:c: flag 4 | do 5 | case "${flag}" in 6 | w) WORK_DIR=${OPTARG};; 7 | v) VENV=${OPTARG};; 8 | c) CSV_PATH=${OPTARG};; 9 | esac 10 | done 11 | 12 | shift $((OPTIND-1)) 13 | CHANNEL_IDS="$1" 14 | 15 | if [ -z "$VENV" ] || [ -z "$CSV_PATH" ]; then 16 | echo "Usage: $0 -w [work_dir] -v [venv] -c [csv_path] [channel_ids]" 17 | echo "" 18 | echo "Example: $0 -w . -v /home/ubuntu/venvs/fc-graph-env3/ -c channels/Top_Channels.csv" 19 | echo "" 20 | echo "Params:" 21 | echo " [work_dir] The working directory to read .env file and execute scripts from." 22 | echo " [venv] The path where a python3 virtualenv has been created." 23 | echo " [csv_path] The path to the CSV file." 24 | echo "" 25 | exit 1 26 | fi 27 | 28 | log() { 29 | echo "`date` - $1" 30 | } 31 | 32 | log "Starting script with parameters: WORK_DIR=${WORK_DIR}, VENV=${VENV}, CSV_PATH=${CSV_PATH}" 33 | 34 | source $WORK_DIR/.env 35 | 36 | set -e 37 | set -o pipefail 38 | 39 | function log() { 40 | echo "`date` - $1" 41 | } 42 | 43 | log "Activating virtual environment" 44 | source $VENV/bin/activate 45 | # pip install -r requirements.txt 46 | log "Executing task" 47 | python3 -m channels.main_fetch_channel_top_casters -c "$CSV_PATH" 48 | deactivate 49 | 50 | -------------------------------------------------------------------------------- /pipeline/run_fetch_top_caster.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | while getopts v:s: flag 4 | do 5 | case "${flag}" in 6 | v) VENV=${OPTARG};; 7 | esac 8 | done 9 | 10 | if [ -z "$VENV" ]; then 11 | echo "Usage: $0 -v [venv]" 12 | echo "" 13 | echo "Example: $0 -v /home/ubuntu/venvs/fc-graph-env3/" 14 | echo "" 15 | echo "Params:" 16 | echo " [venv] The path where a python3 virtualenv has been created." 17 | echo "" 18 | exit 19 | fi 20 | 21 | # set -x 22 | set -e 23 | set -o pipefail 24 | 25 | function log() { 26 | echo "`date` - $1" 27 | } 28 | 29 | source $VENV/bin/activate 30 | # pip install -r requirements.txt 31 | python3 -m casts.main_fetch_top_casters 32 | deactivate 33 | -------------------------------------------------------------------------------- /pipeline/run_fetch_top_spammers.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | while getopts v:s: flag 4 | do 5 | case "${flag}" in 6 | v) VENV=${OPTARG};; 7 | esac 8 | done 9 | 10 | if [ -z "$VENV" ] ; then 11 | echo "Usage: $0 -v [venv]" 12 | echo "" 13 | echo "Example: $0 -v /home/ubuntu/venvs/fc-graph-env3/" 14 | echo "" 15 | echo "Params:" 16 | echo " [venv] The path where a python3 virtualenv has been created." 
17 | echo "" 18 | exit 19 | fi 20 | 21 | # set -x 22 | set -e 23 | set -o pipefail 24 | 25 | function log() { 26 | echo "`date` - $1" 27 | } 28 | 29 | source $VENV/bin/activate 30 | # pip install -r requirements.txt 31 | python3 -m casts.main_fetch_top_spammers 32 | deactivate 33 | -------------------------------------------------------------------------------- /pipeline/run_frame_scraper.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | while getopts d:v: flag 4 | do 5 | case "${flag}" in 6 | d) DAEMON=${OPTARG};; 7 | v) VENV=${OPTARG};; 8 | esac 9 | done 10 | 11 | if [ -z "$VENV" ]; then 12 | echo "Usage: $0 -v [venv]" 13 | echo "" 14 | echo "Example: $0 -v /home/ubuntu/venvs/fc-graph-env3/" 15 | echo "" 16 | echo "Params:" 17 | echo " [venv] The path where a python3 virtualenv has been created." 18 | echo "" 19 | exit 20 | fi 21 | 22 | # set -x 23 | set -e 24 | set -o pipefail 25 | 26 | function log() { 27 | echo "`date` - $1" 28 | } 29 | 30 | DAEMON=${DAEMON:-false} 31 | 32 | source $VENV/bin/activate 33 | # pip install -r requirements.txt 34 | mkdir -p tmp/tldcache 35 | export TLDEXTRACT_CACHE=tmp/tldcache 36 | python3 -m frames.main -d $DAEMON 37 | deactivate 38 | -------------------------------------------------------------------------------- /pipeline/run_graph_pipeline.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | while getopts i:o:p:w:v: flag 4 | do 5 | case "${flag}" in 6 | i) IN_FILE=${OPTARG};; 7 | o) OUT_DIR=${OPTARG};; 8 | p) OUT_PREFIX=${OPTARG};; 9 | w) WORK_DIR=${OPTARG};; 10 | v) VENV=${OPTARG};; 11 | esac 12 | done 13 | 14 | if [ -z "$IN_FILE" ] || [ -z "$OUT_DIR" ] || [ -z "$OUT_PREFIX" ] || [ -z "$WORK_DIR" ] || [ -z "$VENV" ]; then 15 | echo "Usage: $0 -w [work_dir] -v [venv] -i [in_file] -o [out_dir] -p [out_prefix]" 16 | echo "" 17 | echo "Example: $0 -w . -v /home/ubuntu/venvs/fc-graph-env3/ -i /tmp -o /tmp -p test" 18 | echo "" 19 | echo "Params:" 20 | echo " [work_dir] The working directory to read .env file and execute scripts from." 21 | echo " [venv] The path where a python3 virtualenv has been created." 22 | echo " [in_file] The input localtrust (i,j,v edge list) csv file." 23 | echo " [out_dir] The output directory to write the graph file." 24 | echo " [out_prefix] The prefix of the output graph files." 25 | echo "" 26 | exit 27 | fi 28 | 29 | 30 | source $WORK_DIR/.env 31 | 32 | # set -x 33 | set -e 34 | set -o pipefail 35 | 36 | function log() { 37 | echo "`date` - $1" 38 | } 39 | 40 | mkdir -p $OUT_DIR 41 | 42 | source $VENV/bin/activate 43 | #pip install -r requirements.txt 44 | python3 -m graph.gen_igraph -i $IN_FILE -o $OUT_DIR -p $OUT_PREFIX 45 | touch $OUT_DIR/${OUT_PREFIX}_SUCCESS 46 | deactivate 47 | -------------------------------------------------------------------------------- /pipeline/run_notify_channel_daily_trending.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | while getopts "w:v:c:d" flag 4 | do 5 | case "${flag}" in 6 | w) WORK_DIR=${OPTARG};; 7 | v) VENV=${OPTARG};; 8 | c) CSV_PATH=${OPTARG};; 9 | d) DRYRUN_FLAG="--dry-run";; 10 | esac 11 | done 12 | 13 | if [ -z "$WORK_DIR" ] || [ -z "$VENV" ] || [ -z "$CSV_PATH" ]; then 14 | echo "Usage: $0 -w [work_dir] -v [venv] -c [csv_path] -d" 15 | echo "" 16 | echo "Example: $0 -w . -v /home/ubuntu/venvs/fc-graph-env3/ -c channels/Trending_Channels.csv" 17 | echo "Example: $0 -w . 
-v /home/ubuntu/venvs/fc-graph-env3/ -c channels/Trending_Channels.csv -d" 18 | echo "" 19 | echo "Params:" 20 | echo " [work_dir] The working directory to read .env file and execute scripts from." 21 | echo " [venv] The path where a python3 virtualenv has been created." 22 | echo " [csv_path] Path to CSV file." 23 | echo " [dryrun] Flag to run the script in dry-run mode." 24 | echo "" 25 | exit 26 | fi 27 | 28 | source $WORK_DIR/.env 29 | 30 | # set -x 31 | set -e 32 | set -o pipefail 33 | 34 | function log() { 35 | echo "`date` - $1" 36 | } 37 | 38 | source $VENV/bin/activate 39 | #pip install -r requirements.txt 40 | python3 -m channels.main_notify_daily_trending -c "$CSV_PATH" $DRYRUN_FLAG 41 | deactivate 42 | -------------------------------------------------------------------------------- /pipeline/run_notify_channel_leaderboard.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | while getopts "w:v:rd" flag 4 | do 5 | case "${flag}" in 6 | w) WORK_DIR=${OPTARG};; 7 | v) VENV=${OPTARG};; 8 | r) RUN_FLAG="--run";; 9 | d) DRYRUN_FLAG="--dry-run";; 10 | esac 11 | done 12 | 13 | if [ -z "$WORK_DIR" ] || [ -z "$VENV" ] || [ -z "$RUN_FLAG" ]; then 14 | echo "Usage: $0 -w [work_dir] -v [venv] -r -d" 15 | echo "" 16 | echo "Example: $0 -w . -v /home/ubuntu/venvs/fc-graph-env3/ -r" 17 | echo "Example: $0 -w . -v /home/ubuntu/venvs/fc-graph-env3/ -r -d" 18 | echo "" 19 | echo "Params:" 20 | echo " [work_dir] The working directory to read .env file and execute scripts from." 21 | echo " [venv] The path where a python3 virtualenv has been created." 22 | echo " [run] Flag to run the script." 23 | echo " [dryrun] Flag to run the script in dry-run mode." 24 | echo "" 25 | exit 26 | fi 27 | 28 | source $WORK_DIR/.env 29 | 30 | # set -x 31 | set -e 32 | set -o pipefail 33 | 34 | function log() { 35 | echo "`date` - $1" 36 | } 37 | 38 | source $VENV/bin/activate 39 | #pip install -r requirements.txt 40 | python3 -m channels.main_notify_leaderboard $RUN_FLAG $DRYRUN_FLAG 41 | deactivate 42 | -------------------------------------------------------------------------------- /pipeline/run_notify_channel_weekly_mods.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | while getopts "w:v:b:s:d" flag 4 | do 5 | case "${flag}" in 6 | w) WORK_DIR=${OPTARG};; 7 | v) VENV=${OPTARG};; 8 | b) BOTS_CSV=${OPTARG};; 9 | s) SINCE_DATETIME=${OPTARG};; 10 | d) DRYRUN_FLAG="--dry-run";; 11 | esac 12 | done 13 | 14 | if [ -z "$WORK_DIR" ] || [ -z "$VENV" ] || [ -z "$BOTS_CSV" ] || [ -z "$SINCE_DATETIME" ]; then 15 | echo "Usage: $0 -w [work_dir] -v [venv] -b [bots_csv] -s [since_datetime] -d" 16 | echo "" 17 | echo "Example: $0 -w . -v /home/ubuntu/venvs/fc-graph-env3/ -b channels/Bot_Fids.csv -s '2025-04-23 16:30:00+00:00'" 18 | echo "Example: $0 -w . -v /home/ubuntu/venvs/fc-graph-env3/ -b channels/Bot_Fids.csv -s '2025-04-23 16:30:00+00:00' -d" 19 | echo "" 20 | echo "Params:" 21 | echo " [work_dir] The working directory to read .env file and execute scripts from." 22 | echo " [venv] The path where a python3 virtualenv has been created." 23 | echo " [bots_csv] The path to the CSV file that has list of mod bots." 24 | echo " [since_datetime] The datetime to get notifications since." 25 | echo " [dryrun] Flag to run the script in dry-run mode." 
26 | echo "" 27 | exit 28 | fi 29 | 30 | source $WORK_DIR/.env 31 | 32 | # set -x 33 | set -e 34 | set -o pipefail 35 | 36 | function log() { 37 | echo "`date` - $1" 38 | } 39 | 40 | source $VENV/bin/activate 41 | #pip install -r requirements.txt 42 | python3 -m channels.main_notify_weekly_mods -b "$BOTS_CSV" -s "$SINCE_DATETIME" $DRYRUN_FLAG 43 | deactivate 44 | -------------------------------------------------------------------------------- /pipeline/run_update_channel_points.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | while getopts w:v:t:p:g: flag 4 | do 5 | case "${flag}" in 6 | w) WORK_DIR=${OPTARG};; 7 | v) VENV=${OPTARG};; 8 | t) TASK=${OPTARG};; 9 | p) POSTGRES=${OPTARG};; 10 | g) GAPFILL_DATE=${OPTARG};; 11 | esac 12 | done 13 | 14 | if [ -z "$WORK_DIR" ] || [ -z "$VENV" ] || [ -z "$TASK" ]; then 15 | echo "Usage: $0 -w [work_dir] -v [venv] -t [task]" 16 | echo "Usage: $0 -w [work_dir] -v [venv] -t [task] -p [postgres]" 17 | echo "Usage: $0 -w [work_dir] -v [venv] -t gapfill -p [postgres] -g [gapfill_date] " 18 | echo "" 19 | echo "Example: $0 -w . -v /home/ubuntu/venvs/fc-graph-env3/ -t genesis" 20 | echo "Example: $0 -w . -v /home/ubuntu/venvs/fc-graph-env3/ -t compute" 21 | echo "Example: $0 -w . -v /home/ubuntu/venvs/fc-graph-env3/ -t update" 22 | echo "Example: $0 -w . -v /home/ubuntu/venvs/fc-graph-env3/ -t update -p eigen8 -g 2025-04-01" 23 | echo "" 24 | echo "Params:" 25 | echo " [work_dir] The working directory to read .env file and execute scripts from." 26 | echo " [venv] The path where a python3 virtualenv has been created." 27 | echo " [task] The task to perform: daily or distrib." 28 | echo " [postgres] The name of the postgres database to connect to." 29 | echo " [gapfill_date] The date to use for gapfilling in YYYY-MM-DD format." 30 | echo "" 31 | exit 32 | fi 33 | 34 | if [ ! -z "$POSTGRES" ]; then 35 | PG_OPTION="--postgres $POSTGRES" 36 | fi 37 | 38 | if [ "$TASK" = "gapfill" ]; then 39 | if [ -z "$GAPFILL_DATE" ]; then 40 | echo "Please specify -g (gapfill_date) for the gapfill task." 41 | exit 1 42 | fi 43 | fi 44 | 45 | # validating TARGET_MONTH in bash is a bit of a pain 46 | # ... let the python script validate it 47 | if [ ! -z "$GAPFILL_DATE" ]; then 48 | GAPFILL_OPTION="--gapfill-date $GAPFILL_DATE" 49 | fi 50 | 51 | source $WORK_DIR/.env 52 | 53 | # set -x 54 | set -e 55 | set -o pipefail 56 | 57 | function log() { 58 | echo "`date` - $1" 59 | } 60 | 61 | source $VENV/bin/activate 62 | #pip install -r requirements.txt 63 | python3 -m channels.main_points -t "$TASK" $PG_OPTION $GAPFILL_OPTION 64 | deactivate 65 | -------------------------------------------------------------------------------- /pipeline/run_update_channel_tokens.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | while getopts w:v:t:s:r:p: flag 4 | do 5 | case "${flag}" in 6 | w) WORK_DIR=${OPTARG};; 7 | v) VENV=${OPTARG};; 8 | t) TASK=${OPTARG};; 9 | s) SCOPE=${OPTARG};; 10 | r) REASON=${OPTARG};; 11 | p) POSTGRES=${OPTARG};; 12 | esac 13 | done 14 | 15 | if [ -z "$WORK_DIR" ] || [ -z "$VENV" ] || [ -z "$TASK" ]; then 16 | echo "Usage: $0 -w [work_dir] -v [venv] -t [task]" 17 | echo "Usage: $0 -w [work_dir] -v [venv] -t [task] -s [scope] -r [reason] -p [postgres]" 18 | echo "" 19 | echo "Example: $0 -w . -v /home/ubuntu/venvs/fc-graph-env3/ -t prep -s weekly -r reason -p eigen8" 20 | echo " $0 -w . -v /home/ubuntu/venvs/fc-graph-env3/ -t distrib" 21 | echo " $0 -w . 
-v /home/ubuntu/venvs/fc-graph-env3/ -t verify" 22 | echo "" 23 | echo "Params:" 24 | echo " [work_dir] The working directory to read .env file and execute scripts from." 25 | echo " [venv] The path where a python3 virtualenv has been created." 26 | echo " [task] The task to perform: prep or distrib or verify." 27 | echo " [scope] The scope of channels to import: airdrop or daily calculation." 28 | echo " [reason] The reason for the distribution." 29 | echo " [postgres] The name of the postgres database to connect to." 30 | echo "" 31 | exit 32 | fi 33 | 34 | if [ "$TASK" = "prep" ]; then 35 | if [ -z "$SCOPE" ] || [ -z "$REASON" ]; then 36 | echo "Please specify -s (scope) and -r (reason) for the prep task." 37 | exit 1 38 | fi 39 | fi 40 | 41 | if [ ! -z "$POSTGRES" ]; then 42 | PG_OPTION="--postgres $POSTGRES" 43 | fi 44 | 45 | source $WORK_DIR/.env 46 | 47 | # set -x 48 | set -e 49 | set -o pipefail 50 | 51 | function log() { 52 | echo "`date` - $1" 53 | } 54 | 55 | source $VENV/bin/activate 56 | #pip install -r requirements.txt 57 | if [ "$TASK" = "prep" ]; then 58 | python3 -m channels.main_tokens -t prep -s "$SCOPE" -r "$REASON" $PG_OPTION 59 | deactivate 60 | elif [ "$TASK" = "distrib" ]; then 61 | python3 -m channels.main_tokens -t distrib $PG_OPTION 62 | deactivate 63 | elif [ "$TASK" = "verify" ]; then 64 | python3 -m channels.main_tokens -t verify $PG_OPTION 65 | deactivate 66 | else 67 | echo "Invalid task specified. Use 'prep', 'distrib' or 'verify'." 68 | exit 1 69 | fi 70 | -------------------------------------------------------------------------------- /pipeline/samples/pretrust.csv: -------------------------------------------------------------------------------- 1 | i,v 2 | 2,0.5 3 | 3,0.5 4 | -------------------------------------------------------------------------------- /pipeline/schema/globaltrust_config.sql: -------------------------------------------------------------------------------- 1 | -- 2 | -- PostgreSQL database dump 3 | -- 4 | 5 | -- Dumped from database version 16.2 6 | -- Dumped by pg_dump version 16.2 7 | 8 | SET statement_timeout = 0; 9 | SET lock_timeout = 0; 10 | SET idle_in_transaction_session_timeout = 0; 11 | SET client_encoding = 'UTF8'; 12 | SET standard_conforming_strings = on; 13 | SELECT pg_catalog.set_config('search_path', '', false); 14 | SET check_function_bodies = false; 15 | SET xmloption = content; 16 | SET client_min_messages = warning; 17 | SET row_security = off; 18 | 19 | SET default_tablespace = ''; 20 | 21 | SET default_table_access_method = heap; 22 | 23 | -- 24 | -- Name: globaltrust_config; Type: TABLE; Schema: public; Owner: k3l_user 25 | -- 26 | 27 | CREATE TABLE public.globaltrust_config ( 28 | strategy_id integer NOT NULL, 29 | strategy_name character varying(255) NOT NULL, 30 | pretrust text, 31 | localtrust text, 32 | alpha real, 33 | date date DEFAULT CURRENT_TIMESTAMP NOT NULL 34 | ); 35 | 36 | 37 | ALTER TABLE public.globaltrust_config OWNER TO k3l_user; 38 | 39 | -- 40 | -- Data for Name: globaltrust_config; Type: TABLE DATA; Schema: public; Owner: k3l_user 41 | -- 42 | 43 | COPY public.globaltrust_config (strategy_id, strategy_name, pretrust, localtrust, alpha, date) FROM stdin; 44 | 1 follows pretrustAllEqually existingConnections 0.5 2023-12-07 45 | 3 engagement pretrustAllEqually l1rep6rec3m12enhancedConnections 0.5 2023-12-07 46 | 5 activity pretrustAllEqually l1rep1rec1m1enhancedConnections 0.5 2023-12-07 47 | 7 OG circles pretrustSpecificUsernames existingConnections 0.5 2023-12-07 48 | 9 OG engagement 
pretrustSpecificUsernames l1rep6rec3m12enhancedConnections 0.5 2023-12-07 49 | 11 OG activity pretrustSpecificUsernames l1rep1rec1m1enhancedConnections 0.5 2023-12-07 50 | 1 follows pretrustTopTier existingConnections 0.5 2024-03-14 51 | 3 engagement pretrustTopTier l1rep6rec3m12enhancedConnections 0.5 2024-03-14 52 | 1 follows pretrustTopTier existingConnections 0.5 2024-09-27 53 | 3 engagement pretrustTopTier l1rep6rec3m12enhancedConnections 0.5 2024-09-27 54 | 9 v3engagement v2pretrustTopTier followsboostedl1rep3rec6m12 0.5 2024-09-27 55 | \. 56 | 57 | 58 | -- 59 | -- Name: globaltrust_config globaltrust_config_pkey; Type: CONSTRAINT; Schema: public; Owner: k3l_user 60 | -- 61 | 62 | ALTER TABLE ONLY public.globaltrust_config 63 | ADD CONSTRAINT globaltrust_config_pkey PRIMARY KEY (strategy_id, date); 64 | 65 | 66 | -- 67 | -- Name: TABLE globaltrust_config; Type: ACL; Schema: public; Owner: k3l_user 68 | -- 69 | 70 | GRANT SELECT,REFERENCES ON TABLE public.globaltrust_config TO k3l_readonly; 71 | 72 | 73 | -- 74 | -- PostgreSQL database dump complete 75 | -- 76 | 77 | -------------------------------------------------------------------------------- /pipeline/scripts/archived/run_create_degen_db_functions.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -x 4 | set -e 5 | 6 | while getopts v:t: flag 7 | do 8 | case "${flag}" in 9 | v) VENV=${OPTARG};; 10 | t) TASK=${OPTARG};; 11 | esac 12 | done 13 | 14 | if [ -z "$VENV" ]; then 15 | echo "Usage: $0 -v [venv]" 16 | echo "" 17 | echo "Example: $0 -v /home/ubuntu/venvs/fc-graph-env3/" 18 | echo "" 19 | echo "Params:" 20 | echo " [venv] The path where a python3 virtualenv has been created." 21 | echo " [task] The task to perform: 'extract' or 'insert_scores'." 22 | echo "" 23 | exit 24 | fi 25 | 26 | # set -x 27 | set -e 28 | set -o pipefail 29 | 30 | source $VENV/bin/activate 31 | # pip install -r requirements.txt 32 | 33 | echo "Executing task: $TASK" 34 | if [ "$TASK" = "extract" ]; then 35 | python3 -m degen.create_degen_sql_functions 36 | elif [ "$TASK" = "insert_scores" ]; then 37 | python3 -m degen.calculate_rank 38 | else 39 | echo "Invalid task specified. Use 'extract' or 'insert_scores'." 40 | exit 1 41 | fi 42 | deactivate 43 | -------------------------------------------------------------------------------- /pipeline/scripts/archived/run_sandbox_backup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source .env 4 | 5 | set -x 6 | set -e # Exit immediately if a command exits with a non-zero status 7 | set -o pipefail # Ensure pipeline failures are propagated 8 | 9 | 10 | # TODO: move this to cli args 11 | DATE_SUFFIX=$(date +"%Y%m%d" ) 12 | BACKUP_DIR="/tmp/sandbox-backup-$DATE_SUFFIX" 13 | BACKUP_FILE="sandbox_pgdump" 14 | S3_BUCKET='k3l-farcaster-backups' 15 | S3_PREFIX='pg_dump/' 16 | 17 | #DB details 18 | DB_NAME=$SANDBOX_DB_NAME 19 | DB_USER=$SANDBOX_DB_USER 20 | DB_PASSWORD=$SANDBOX_DB_PASSWORD 21 | DB_HOST=$SANDBOX_DB_HOST 22 | DB_PORT=$SSH_LISTEN_PORT 23 | 24 | rm -rf "$BACKUP_DIR" 25 | mkdir -p "$BACKUP_DIR" 26 | 27 | # Perform the backup 28 | echo "Starting backup..." 29 | set +x # Disable command echoing 30 | export PGPASSWORD="$DB_PASSWORD" 31 | set -x # Re-enable command echoing 32 | pg_dump -h $DB_HOST -p $DB_PORT -U $DB_USER -d $DB_NAME \ 33 | -j 1 \ 34 | -Fd \ 35 | -f "$BACKUP_DIR/$BACKUP_FILE" 36 | unset PGPASSWORD 37 | 38 | # Check if backup was successful 39 | if [ $? 
-eq 0 ]; then 40 | echo "Backup completed successfully" 41 | 42 | # Compress the backup 43 | tar czf "$BACKUP_DIR/$BACKUP_FILE.tgz" -C "$BACKUP_DIR" $BACKUP_FILE 44 | echo "Backup compressed" 45 | 46 | # Upload to S3 47 | echo "Uploading backup to S3..." 48 | aws s3 cp "$BACKUP_DIR/$BACKUP_FILE.tgz" "s3://$S3_BUCKET/$S3_PREFIX$BACKUP_FILE.tgz" 49 | 50 | if [ $? -eq 0 ]; then 51 | echo "Backup successfully uploaded to S3" 52 | rm -rf "$BACKUP_DIR" 53 | else 54 | echo "Failed to upload backup to S3" 55 | exit 1 56 | fi 57 | else 58 | echo "Backup failed" 59 | exit 1 60 | fi 61 | 62 | exit 0 63 | -------------------------------------------------------------------------------- /pipeline/scripts/archived/run_urlextract_pipeline.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | while getopts w: flag 4 | do 5 | case "${flag}" in 6 | w) WORK_DIR=${OPTARG};; 7 | esac 8 | done 9 | 10 | if [ -z "$WORK_DIR" ]; then 11 | echo "Usage: $0 -w [work_dir]" 12 | echo "" 13 | echo "Example: $0 -w ." 14 | echo "" 15 | echo "Params:" 16 | echo " [work_dir] The working directory to read .env file and execute scripts from." 17 | echo "" 18 | exit 19 | fi 20 | 21 | source $WORK_DIR/.env 22 | 23 | DB_HOST=${DB_HOST:-127.0.0.1} 24 | DB_PORT=${DB_PORT:-5432} 25 | DB_USER=${DB_USER:-replicator} 26 | DB_NAME=${DB_NAME:-replicator} 27 | DB_PASSWORD=${DB_PASSWORD:-password} # psql requires PGPASSWORD to be set 28 | 29 | # set -x 30 | set -e 31 | set -o pipefail 32 | 33 | if hash psql 2>/dev/null; then 34 | echo "OK, you have psql in the path. We’ll use that." 35 | PSQL=psql 36 | else 37 | echo "You don't have psql is the path. Let's try /usr/bin" 38 | hash /usr/bin/psql 39 | PSQL=/usr/bin/psql 40 | fi 41 | 42 | function log() { 43 | echo "`date` - $1" 44 | } 45 | 46 | log "Inserting into k3l_url_labels" 47 | PGPASSWORD=$DB_PASSWORD \ 48 | $PSQL -h $DB_HOST -p $DB_PORT -U $DB_USER -d $DB_NAME \ 49 | -f $WORK_DIR/frames/incremental_load_labels.sql 50 | 51 | wait $! 52 | 53 | log "Inserting into k3l_cast_embed_url_mapping" 54 | PGPASSWORD=$DB_PASSWORD \ 55 | $PSQL -h $DB_HOST -p $DB_PORT -U $DB_USER -d $DB_NAME \ 56 | -f $WORK_DIR/frames/incremental_load_cast_mapping.sql 57 | 58 | wait $! 59 | 60 | this_name=`basename "$0"` 61 | log "$this_name done!" -------------------------------------------------------------------------------- /pipeline/scripts/one_off/diff_db_table.py: -------------------------------------------------------------------------------- 1 | if __name__ == "__main__": 2 | print("Not implemented") 3 | pass -------------------------------------------------------------------------------- /pipeline/scripts/one_off/run_cast_pipeline_gapfills.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | date_format='%Y-%m-%d' 4 | 5 | # Function to validate date format 6 | function validate_date() { 7 | date_to_check=$1 8 | 9 | # Check if the date matches the format YYYY-mm-dd 10 | if [[ $(uname) == "Darwin" ]]; then 11 | if ! date -j -f "$date_format" "$date_to_check" >/dev/null 2>&1; then 12 | echo "Invalid date format. Use YYYY-mm-dd." 13 | exit 1 14 | fi 15 | else 16 | if ! date -d "$date_to_check" +"$date_format" >/dev/null 2>&1; then 17 | echo "Invalid date format. Use YYYY-mm-dd." 
18 | exit 1 19 | fi 20 | fi 21 | 22 | # Check if the date is in the past 23 | today=$(date +"$date_format") 24 | if [ "$date_to_check" \> "$today" ] || [ "$date_to_check" == "$today" ]; then 25 | echo "The date must be in the past and not include today." 26 | exit 1 27 | fi 28 | } 29 | 30 | while getopts v:s:p:e:l: flag 31 | do 32 | case "${flag}" in 33 | v) VENV=${OPTARG};; 34 | s) START_DATE=${OPTARG};; 35 | e) END_DATE=${OPTARG};; 36 | p) POSTGRES=${OPTARG};; 37 | l) SLEEP_TIME=${OPTARG};; 38 | esac 39 | done 40 | 41 | if [ -z "$VENV" ] || [ -z "$START_DATE" ] || [ -z "$END_DATE" ]; then 42 | echo "Usage: $0 -v [venv] -s [start_date] -e [end_date]" 43 | echo "Usage: $0 -v [venv] -s [start_date] -e [end_date] -p [postgres] -l [sleep_time]" 44 | echo "" 45 | echo "Example: $0 -v /home/ubuntu/venvs/fc-graph-env3/ -s 2025-02-01 -e 2025-02-05" 46 | echo "Example: $0 -v /home/ubuntu/venvs/fc-graph-env3/ -s 2025-02-01 -e 2025-02-05 -p eigen8" 47 | echo "" 48 | echo "Params:" 49 | echo " [venv] The path where a python3 virtualenv has been created." 50 | echo " [start_date] The date to start the gapfilling process." 51 | echo " [end_date] The date to end the gapfilling process." 52 | echo " [postgres] 'eigen2' or 'eigen8'" 53 | echo " [sleep_time] The amount of time to sleep between gapfill runs." 54 | echo "" 55 | exit 56 | fi 57 | 58 | if [ ! -z "$POSTGRES" ]; then 59 | PG_OPTION="--postgres $POSTGRES" 60 | fi 61 | 62 | validate_date $START_DATE 63 | validate_date $END_DATE 64 | 65 | SLEEP_TIME=${SLEEP_TIME:-30s} 66 | 67 | 68 | # set -x 69 | set -e 70 | set -o pipefail 71 | 72 | function log() { 73 | echo "`date` - $1" 74 | } 75 | 76 | source $VENV/bin/activate 77 | # pip install -r requirements.txt 78 | while [[ $START_DATE < $END_DATE ]]; do 79 | DATE_OPTION=(--target-date "$START_DATE 00:00:00") 80 | FILL_TYPE="gapfill" 81 | DAEMON_FLAG="" 82 | log "Running gapfill for $START_DATE" 83 | python3 -m casts.main $PG_OPTION $DAEMON_FLAG -f $FILL_TYPE "${DATE_OPTION[@]}" 84 | log "Sleeping for $SLEEP_TIME" 85 | sleep $SLEEP_TIME 86 | START_DATE=$(date -I -d "$START_DATE + 1 day") 87 | done 88 | deactivate 89 | 90 | log "Done" 91 | -------------------------------------------------------------------------------- /pipeline/sshtunnel.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM alpine:3.8 2 | 3 | RUN apk add --no-cache autossh libressl 4 | 5 | RUN mkdir -p ~/.ssh 6 | 7 | ENTRYPOINT ["/usr/bin/autossh", \ 8 | "-M", "0", "-T", "-N", "-g", "-v", \ 9 | "-oStrictHostKeyChecking=no", \ 10 | "-oServerAliveInterval=180", \ 11 | "-oUserKnownHostsFile=/dev/null", \ 12 | "-oGlobalKnownHostsFile=/dev/null", \ 13 | "-i/root/.ssh/id_rsa"] -------------------------------------------------------------------------------- /pipeline/timer.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://realpython.com/python-timer/#the-python-timer-code 2 | import time 3 | from contextlib import ContextDecorator 4 | from dataclasses import dataclass, field 5 | from typing import Any, Callable, ClassVar, Dict, Optional 6 | 7 | class TimerError(Exception): 8 | """A custom exception used to report errors in use of Timer class""" 9 | 10 | @dataclass 11 | class Timer(ContextDecorator): 12 | """Time your code using a class, context manager, or decorator 13 | Class: 14 | ====== 15 | t = Timer(name="class") 16 | t.start() 17 | # Do something 18 | t.stop() 19 | Context Manager: 20 | ================ 21 | with 
Timer(name="context manager"): 22 | # Do something 23 | Decorator: 24 | ========== 25 | @Timer(name="decorator") 26 | def stuff(): 27 | # Do something 28 | """ 29 | 30 | timers: ClassVar[Dict[str, float]] = {} 31 | name: Optional[str] = None 32 | text: str = "Elapsed time: {n} took {t:0.4f} seconds" 33 | logger: Optional[Callable[[str], None]] = print 34 | _start_time: Optional[float] = field(default=None, init=False, repr=False) 35 | 36 | def __post_init__(self) -> None: 37 | """Initialization: add timer to dict of timers""" 38 | if self.name: 39 | self.timers.setdefault(self.name, 0) 40 | 41 | def start(self) -> None: 42 | """Start a new timer""" 43 | if self._start_time is not None: 44 | raise TimerError(f"Timer is running. Use .stop() to stop it") 45 | self.logger("Start a new timer: {n}".format(n=self.name)) 46 | self._start_time = time.perf_counter() 47 | 48 | def stop(self) -> float: 49 | """Stop the timer, and report the elapsed time""" 50 | if self._start_time is None: 51 | raise TimerError(f"Timer is not running. Use .start() to start it") 52 | 53 | # Calculate elapsed time 54 | elapsed_time = time.perf_counter() - self._start_time 55 | self._start_time = None 56 | 57 | # Report elapsed time 58 | if self.logger: 59 | self.logger(self.text.format(n=self.name, t=elapsed_time)) 60 | if self.name: 61 | self.timers[self.name] += elapsed_time 62 | 63 | return elapsed_time 64 | 65 | def __enter__(self) -> "Timer": 66 | """Start a new timer as a context manager""" 67 | self.start() 68 | return self 69 | 70 | def __exit__(self, *exc_info: Any) -> None: 71 | """Stop the context manager timer""" 72 | self.stop() -------------------------------------------------------------------------------- /pipeline/tmp/.placeholder: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/pipeline/tmp/.placeholder -------------------------------------------------------------------------------- /scripts/.placeholder: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/scripts/.placeholder -------------------------------------------------------------------------------- /scripts/certs/graphcast_jobs/.env.sample: -------------------------------------------------------------------------------- 1 | NGINX_CONFIG="/etc/nginx/sites-enabled/graph.cast.k3l.io" 2 | WORK_DIR="/home/ubuntu/graphcast_jobs" 3 | REMOTE_USER="ubuntu" 4 | REMOTE_DIR="/home/ubuntu/graphcast_jobs/" 5 | SSH_PRIV_KEY="/home/ubuntu/.ssh/id_graphcast_jobs" 6 | 7 | -------------------------------------------------------------------------------- /scripts/certs/graphcast_jobs/README.md: -------------------------------------------------------------------------------- 1 | We use letsencrypt to issue SSL certs for our domains. 2 | 3 | # Step 1. graph.castN.k3l.io 4 | 5 | For example, graph.cast9.k3l.io. This sub-domain is not load-balanced but is very useful when we want to simulate a blue-green deployment. Setting up this sub-domain also makes the next step simple. 6 | 7 | A typical crontab to both **install** as well as **renew** certs looks like this: 8 | ``` 9 | 1 0 */7 * * sudo certbot run --nginx -d graph.cast9.k3l.io -m ops@karma3labs.com --agree-tos -n 10 | ``` 11 | This crontab assumes that `/etc/nginx/sites-available/` is already configured for the sub-domain name.
12 | 13 | This repo has a sample nginx file that you can use. **REMEMBER** to replace `N` with your preferred number. 14 | Also, **REMEMBER** to soft link the config file `sudo ln -s /etc/nginx/sites-available/graph.castN.k3l.io /etc/nginx/sites-enabled/` 15 | **NOTE** the sample file does not have ssl config because certbot will add the appropriate config when certbot is run for the first time `sudo certbot run --nginx -d graph.castN.k3l.io -m ops@karma3labs.com --agree-tos -n` 16 | 17 | # Step 2. graph.cast.k3l.io 18 | The sub-domain `graph.cast.k3l.io` is load-balanced across multiple machines. When renewing certs, we cannot have each machine renew its own cert and invalidate the certs on the others. So, we renew certs on 1 machine and push the cert to all the other machines. 19 | 20 | The `push_certs.sh` script pushes the renewed cert from the primary to the other machines, while `install_certs.sh` installs the pushed cert on each of those machines. 21 | 22 | #### Pre-req 23 | `/etc/nginx/sites-available/` should have a config for `graph.cast.k3l.io` 24 | 25 | This repo has a sample nginx file that you can use. **REMEMBER** to replace `CHANGME_OPENSSL_RAND_KEY` with a strong api key. Also, **REMEMBER** to soft link the config file `sudo ln -s /etc/nginx/sites-available/graph.cast.k3l.io /etc/nginx/sites-enabled/` 26 | 27 | #### Cronjobs 28 | A typical crontab on the **"primary"** host looks like this: 29 | ``` 30 | 15 0 */7 * * sudo certbot run --nginx -d graph.cast.k3l.io -m ops@karma3labs.com --agree-tos -n >> /var/log/farcaster-graph/graphcast_jobs.log 2>&1; sudo nginx -s reload >> /var/log/farcaster-graph/graphcast_jobs.log 2>&1; date >> /var/log/farcaster-graph/graphcast_jobs.log ; cd /home/ubuntu/graphcast_jobs; ./push_certs.sh -h 162.55.109.106 >> /var/log/farcaster-graph/graphcast_jobs.log 2>&1; 31 | ``` 32 | 1. renew cert `sudo certbot run --nginx -d graph.cast.k3l.io -m ops@karma3labs.com --agree-tos -n` 33 | 2. reload nginx locally to make sure cert is fine `sudo nginx -s reload` 34 | 3. push renewed cert to 162.55.109.106 `./push_certs.sh -h 162.55.109.106` 35 | 36 | And, the crontab on the **"secondary"** host looks like this: 37 | ``` 38 | 30 0 */7 * * date >> /var/log/farcaster-graph/graphcast_jobs.log ; cd /home/ubuntu/graphcast_jobs; ./install_certs.sh >> /var/log/farcaster-graph/graphcast_jobs.log 2>&1 39 | ``` 40 | 1. install cert assuming that graph.cast.k3l.io nginx config already exists and the "primary" server has scp'd over the pem files.
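After a renewal cycle, it can be worth confirming that every backend behind `graph.cast.k3l.io` is actually serving the refreshed cert. A minimal sketch of such a check (not part of this repo; the host IPs are simply the examples used above and in `push_certs.sh`):
```
# compare the certificate expiry dates served by each backend host
for host in 162.55.109.106 37.27.108.188; do
  echo "checking graph.cast.k3l.io via ${host}"
  echo | openssl s_client -connect "${host}:443" -servername graph.cast.k3l.io 2>/dev/null \
    | openssl x509 -noout -enddate
done
```
If the `notAfter=` dates differ across hosts, the push/install step on the lagging host most likely failed; check `/var/log/farcaster-graph/graphcast_jobs.log` on that host.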
-------------------------------------------------------------------------------- /scripts/certs/graphcast_jobs/graph.castN.k3l.io: -------------------------------------------------------------------------------- 1 | # Allow listed IP addresses with no rate limits 2 | geo $limit { 3 | default 1; 4 | 10.0.0.0/8 0; 5 | 127.0.0.1/32 0; 6 | 192.168.0.0/24 0; 7 | } 8 | 9 | map $limit $limit_key { 10 | 0 ""; 11 | 1 $binary_remote_addr; 12 | } 13 | 14 | # Specify 10 MB storage of binary IP addresses to keep track of 1.6 mil addresses 15 | # to limit at 5 requests/second 16 | limit_req_zone $limit_key zone=graph_castN_zone:10m rate=5r/s; 17 | 18 | server { 19 | server_name graph.castN.k3l.io; 20 | 21 | location ~* \.(env|git|bak|config|log|sh).* { 22 | deny all; 23 | return 404; 24 | } 25 | 26 | 27 | location ~ ^/(_pause|_resume) { 28 | return 404; 29 | } 30 | 31 | location / { 32 | # apply rate limit 33 | limit_req zone=graph_castN_zone burst=10; 34 | proxy_pass http://localhost:8000; 35 | proxy_http_version 1.1; 36 | proxy_set_header Upgrade $http_upgrade; 37 | proxy_set_header Connection 'upgrade'; 38 | proxy_set_header Host $host; 39 | proxy_cache_bypass $http_upgrade; 40 | } 41 | 42 | } 43 | 44 | server { 45 | server_name graph.castN.k3l.io; 46 | 47 | location ~* \.(woff|jpg|jpeg|png|gif|ico|css|js)$ { 48 | access_log off; 49 | } 50 | 51 | listen 80; 52 | } 53 | -------------------------------------------------------------------------------- /scripts/certs/graphcast_jobs/install_certs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | # Function to log messages with a timestamp 6 | log_message() { 7 | echo "$(date '+%Y-%m-%d %H:%M:%S') - $1" 8 | } 9 | 10 | # Source the environment variables from the .env file 11 | if [ -f .env ]; then 12 | source .env 13 | else 14 | log_message "Error: .env file not found." 15 | exit 1 16 | fi 17 | 18 | # Check if CONFIG and WORK_DIR are set 19 | if [ -z "$NGINX_CONFIG" ] || [ -z "$WORK_DIR" ]; then 20 | log_message "Error: CONFIG and WORK_DIR environment variables must be set." 21 | exit 1 22 | fi 23 | 24 | log_message "Starting check_certificates.sh script." 25 | 26 | # Extract the certificate file paths from the Nginx config file 27 | log_message "Extracting certificate file paths from the Nginx config file." 28 | CERT_FILES=$(grep -E 'ssl_certificate|ssl_certificate_key' $NGINX_CONFIG | awk '{print $2}' | tr -d ';') 29 | 30 | # Flag to indicate if any files were moved 31 | FILES_MOVED=false 32 | 33 | # Check and move the files if they exist 34 | for FILE in $CERT_FILES; do 35 | FILE_NAME=$(basename $FILE) 36 | DIR_NAME=$(dirname $FILE) 37 | if [ -f ${WORK_DIR}/${FILE_NAME} ]; then 38 | log_message "Moving ${WORK_DIR}/${FILE_NAME} to $FILE." 39 | sudo mkdir -p $DIR_NAME 40 | sudo mv ${WORK_DIR}/${FILE_NAME} $FILE 41 | FILES_MOVED=true 42 | else 43 | log_message "File ${WORK_DIR}/${FILE_NAME} not found." 44 | fi 45 | done 46 | 47 | # Reload Nginx if any files were moved 48 | if [ "$FILES_MOVED" = true ]; then 49 | log_message "Files moved. Reloading Nginx." 50 | sudo nginx -s reload 51 | else 52 | log_message "No files moved. Nginx reload not required." 53 | fi 54 | 55 | log_message "Script completed." 
56 | -------------------------------------------------------------------------------- /scripts/certs/graphcast_jobs/push_certs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | while getopts h: flag 4 | do 5 | case "${flag}" in 6 | h) REMOTE_HOST=${OPTARG};; 7 | esac 8 | done 9 | 10 | if [ -z "$REMOTE_HOST" ]; then 11 | echo "Usage: $0 -h [remote_host]" 12 | echo "" 13 | echo "Example: $0 -h 37.27.108.188" 14 | echo "" 15 | echo "Params:" 16 | echo " [remote_host] host to which the pem files have to be copied over to" 17 | echo"" 18 | exit 19 | fi 20 | 21 | 22 | # Function to log messages with a timestamp 23 | log_message() { 24 | echo "$(date '+%Y-%m-%d %H:%M:%S') - $1" 25 | } 26 | 27 | # Source the environment variables from the .env file 28 | if [ -f .env ]; then 29 | log_message "Loading .env file." 30 | source .env 31 | else 32 | log_message "Error: .env file not found." 33 | exit 1 34 | fi 35 | 36 | # Check if NGINX_CONFIG, REMOTE_USER, REMOTE_HOST, and REMOTE_DIR are set 37 | if [ -z "$NGINX_CONFIG" ] || [ -z "$REMOTE_USER" ] || [ -z "$REMOTE_DIR" ] || [ -z "$SSH_PRIV_KEY" ]; then 38 | log_message "Error: NGINX_CONFIG, REMOTE_USER, REMOTE_HOST, REMOTE_DIR and SSH_PRIV_KEY environment variables must be set." 39 | exit 1 40 | fi 41 | 42 | log_message "Starting sync_certificates.sh script." 43 | 44 | # Extract the certificate file paths from the Nginx config file 45 | log_message "Extracting certificate file paths from the Nginx config file." 46 | CERT_FILES=$(grep -E 'ssl_certificate|ssl_certificate_key' $NGINX_CONFIG | awk '{print $2}' | tr -d ';') 47 | 48 | # SCP the certificate files to the remote server 49 | for FILE in $CERT_FILES; do 50 | log_message "Transferring $FILE to ${REMOTE_USER}@${REMOTE_HOST}:${REMOTE_DIR}." 51 | sudo scp -p -i $SSH_PRIV_KEY $FILE ${REMOTE_USER}@${REMOTE_HOST}:${REMOTE_DIR} 52 | done 53 | 54 | log_message "Script completed." 55 | -------------------------------------------------------------------------------- /serve/.dockerignore: -------------------------------------------------------------------------------- 1 | .venv 2 | .env.sample 3 | scratchpad.md -------------------------------------------------------------------------------- /serve/.env.sample: -------------------------------------------------------------------------------- 1 | DB_PASSWORD=password 2 | DB_HOST=host 3 | DB_NAME=postgres 4 | DB_USERNAME=postgres 5 | DB_PORT=5432 6 | 7 | GO_EIGENTRUST_URL=http://localhost:8080 8 | 9 | FOLLOW_GRAPH_PATHPREFIX=./samples/fc_following_fid 10 | ENGAGEMENT_GRAPH_PATHPREFIX=./samples/fc_engagement_fid 11 | NINETYDAYS_GRAPH_PATHPREFIX=./samples/fc_90dv3_fid 12 | 13 | # SWAGGER_BASE_URL='CHANGE THIS AND UNCOMMENT' 14 | # CURA_API_KEY='CHANGE THIS AND UNCOMMENT' 15 | 16 | USE_PANDAS_PERF='True or False ?' 
17 | # optional overrides 18 | # LOG_LEVEL=INFO 19 | # LOG_LEVEL_CORE='DEBUG' 20 | # LOGURU_FORMAT='{time:YYYY-MM-DD HH:mm:ss} | {module}:{file}:{function}:{line} | {level} | {message}' 21 | 22 | # POSTGRES_POOL_SIZE=5 23 | # POSTGRES_ECHO=False 24 | # POSTGRES_TIMEOUT_SECS=60 25 | 26 | # EIGENTRUST_ALPHA=0.5 27 | # EIGENTRUST_EPSILON=1.0 28 | # EIGENTRUST_MAX_ITER=50 29 | # EIGENTRUST_FLAT_TAIL=2 30 | # GO_EIGENTRUST_TIMEOUT_MS=3000 31 | 32 | # CURA_API_ENDPOINT=https://cura.network/api 33 | -------------------------------------------------------------------------------- /serve/.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | # Editor-based HTTP Client requests 5 | /httpRequests/ 6 | # Datasource local storage ignored files 7 | /dataSources/ 8 | /dataSources.local.xml 9 | -------------------------------------------------------------------------------- /serve/.idea/.name: -------------------------------------------------------------------------------- 1 | farcaster-graph-serve -------------------------------------------------------------------------------- /serve/.idea/codeStyles/codeStyleConfig.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | -------------------------------------------------------------------------------- /serve/.idea/dataSources.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | postgresql 6 | true 7 | org.postgresql.Driver 8 | jdbc:postgresql://localhost:9541/postgres 9 | $ProjectFileDir$ 10 | 11 | 12 | -------------------------------------------------------------------------------- /serve/.idea/data_source_mapping.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /serve/.idea/farcaster-graph-serve.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 12 | -------------------------------------------------------------------------------- /serve/.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /serve/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 77 | -------------------------------------------------------------------------------- /serve/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /serve/.idea/sqldialects.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /serve/.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 
| 6 | -------------------------------------------------------------------------------- /serve/.idea/watcherTasks.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 16 | 24 | 25 | -------------------------------------------------------------------------------- /serve/Dockerfile: -------------------------------------------------------------------------------- 1 | # FROM python:3.12-alpine 2 | # not taking the alpine route because packages like psutil don't install without gcc 3 | FROM python:3.12-slim 4 | 5 | RUN pip install --upgrade pip 6 | 7 | RUN pip install poetry 8 | 9 | # single app container 10 | # no need to create virtual envs 11 | # install dependencies into the systems python environment 12 | ENV POETRY_VERSION=1.7.1 \ 13 | POETRY_NO_INTERACTION=1 \ 14 | POETRY_VIRTUALENVS_CREATE=false 15 | 16 | WORKDIR /code 17 | 18 | COPY pyproject.toml poetry.lock ./ 19 | COPY README.md ./ 20 | 21 | # we don't want to rebuild all the layers after every app code change 22 | # ignore app code for now 23 | # uncomment the next line if we start using dev/test specific dependencies 24 | # RUN poetry install --without dev,test --no-root 25 | RUN poetry install --no-root 26 | 27 | COPY ./app /code/app 28 | COPY ./static /code/static 29 | COPY .env.docker ./.env 30 | 31 | # install app code, this is the last image layer and has to be rebuilt 32 | # uncomment the next line if we start using dev/test specific dependencies 33 | # RUN poetry install --without dev,test 34 | RUN poetry install --no-root 35 | 36 | EXPOSE 8000 37 | 38 | CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"] 39 | -------------------------------------------------------------------------------- /serve/app/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/serve/app/__init__.py -------------------------------------------------------------------------------- /serve/app/dependencies/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/serve/app/dependencies/__init__.py -------------------------------------------------------------------------------- /serve/app/dependencies/cache_db_utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from asyncpg.pool import Pool 4 | from loguru import logger 5 | 6 | 7 | async def set_homefeed_for_fid( 8 | fid: int, cids: list[str], offset: int, cache_pool: Pool 9 | ): 10 | 11 | session_data = {"api": "homefeed", "cids": cids, "offset": offset} 12 | session_value = json.dumps(session_data) 13 | key = f"session:{fid}" 14 | 15 | # TODO update db using cache_pool 16 | pass 17 | 18 | 19 | async def get_homefeed_for_fid(fid: int, cache_pool: Pool) -> dict: 20 | 21 | key = f"session:{fid}" 22 | 23 | # TODO get cached data from db using cache_pool 24 | 25 | return {"cids": [], "offset": 0} 26 | -------------------------------------------------------------------------------- /serve/app/dependencies/db_pool.py: -------------------------------------------------------------------------------- 1 | from fastapi import Request 2 | 3 | 4 | # dependency to make it explicit that routers are accessing hidden state 5 | def get_db(request: Request): 6 | return request.state.db_pool 7 | 8 | 9 | def 
get_cache_db(request: Request): 10 | return request.state.cache_db_pool 11 | -------------------------------------------------------------------------------- /serve/app/dependencies/logging.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | import logging 3 | 4 | from fastapi import Request 5 | from loguru import logger 6 | 7 | 8 | async def get_logger(request: Request): 9 | logger.debug(f"{request.method} {request.url}") 10 | logger.debug("Params:") 11 | for name, value in request.path_params.items(): 12 | logger.debug(f"\t{name}: {value}") 13 | logger.debug("Headers:") 14 | for name, value in request.headers.items(): 15 | logger.debug(f"\t{name}: {value}") 16 | 17 | 18 | class InterceptHandler(logging.Handler): 19 | """ 20 | This intercept allows loguru to work with Python's standard logging module. 21 | https://loguru.readthedocs.io/en/stable/overview.html#entirely-compatible-with-standard-logging 22 | """ 23 | 24 | def emit(self, record: logging.LogRecord) -> None: 25 | # Get corresponding Loguru level if it exists. 26 | level: str | int 27 | try: 28 | level = logger.level(record.levelname).name 29 | except ValueError: 30 | level = record.levelno 31 | 32 | # Find caller from where originated the logged message. 33 | frame, depth = inspect.currentframe(), 0 34 | while frame and (depth == 0 or frame.f_code.co_filename == logging.__file__): 35 | frame = frame.f_back 36 | depth += 1 37 | 38 | logger.opt(depth=depth, exception=record.exc_info).log( 39 | level, record.getMessage() 40 | ) 41 | -------------------------------------------------------------------------------- /serve/app/dependencies/memoize_utils.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, Tuple 2 | 3 | from memoize.key import KeyExtractor 4 | 5 | 6 | class EncodedMethodNameAndArgsExcludedKeyExtractor(KeyExtractor): 7 | """Encodes method name, args & kwargs to string and uses that as cache entry key. 8 | This KeyExtractor is class-centric and creates same keys for all objects of the same type. 9 | You can exclude args and kwargs by setting 'skip_args' and 'skip_kwargs' flags. 10 | 11 | Note: If wrapped function is a method (has 'self' as first positional arg) you may want to exclude 'self' from key 12 | by setting 'skip_first_arg_as_self' flag. 13 | For static methods of ordinary functions flag should be set to 'False'. 
14 | 15 | Warning: uses method name only, so be cautious and do not wrap methods of different classes with the same names 16 | while using same store and 'skip_first_arg_as_self' set to False.""" 17 | 18 | def __init__( 19 | self, 20 | skip_first_arg_as_self=False, 21 | skip_args: list[int] = [], 22 | skip_kwargs: list[str] = [], 23 | ) -> None: 24 | self._skip_first_arg_as_self = skip_first_arg_as_self 25 | self._skip_args = skip_args 26 | self._skip_kwargs = skip_kwargs 27 | 28 | def format_key( 29 | self, method_reference, call_args: Tuple[Any, ...], call_kwargs: Dict[str, Any] 30 | ) -> str: 31 | if self._skip_args: 32 | call_args = [ 33 | arg for i, arg in enumerate(call_args) if i not in self._skip_args 34 | ] 35 | if self._skip_kwargs: 36 | call_kwargs = { 37 | k: v for k, v in call_kwargs.items() if k not in self._skip_kwargs 38 | } 39 | if self._skip_first_arg_as_self: 40 | call_args.pop(0) 41 | 42 | return str( 43 | ( 44 | method_reference.__name__, 45 | call_args, 46 | call_kwargs, 47 | ) 48 | ) 49 | 50 | def __str__(self) -> str: 51 | return self.__repr__() 52 | 53 | def __repr__(self) -> str: 54 | return ( 55 | f"{self.__class__}" 56 | f"[skip_first_arg_as_self={self._skip_first_arg_as_self}]" 57 | f"[skip_args={self._skip_args}]" 58 | ) 59 | -------------------------------------------------------------------------------- /serve/app/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/serve/app/models/__init__.py -------------------------------------------------------------------------------- /serve/app/models/channel_model.py: -------------------------------------------------------------------------------- 1 | from enum import Enum, StrEnum 2 | 3 | from pydantic import BaseModel 4 | 5 | 6 | class ChannelRankingsTimeframe(str, Enum): 7 | LIFETIME = 'lifetime' 8 | SIXTY_DAYS = '60d' 9 | SEVEN_DAYS = '7d' 10 | ONE_DAY = '1d' 11 | 12 | 13 | CHANNEL_RANKING_STRATEGY_NAMES = { 14 | ChannelRankingsTimeframe.LIFETIME: 'channel_engagement', 15 | ChannelRankingsTimeframe.SIXTY_DAYS: '60d_engagement', 16 | ChannelRankingsTimeframe.SEVEN_DAYS: '7d_engagement', 17 | ChannelRankingsTimeframe.ONE_DAY: '1d_engagement', 18 | } 19 | 20 | 21 | class OpenrankCategory(StrEnum): 22 | TEST = 'test' 23 | PROD = 'prod' 24 | 25 | 26 | # Deprecated 27 | class ChannelPointsOrderBy(StrEnum): 28 | TOTAL_POINTS = 'total_points' 29 | DAILY_POINTS = 'daily_points' 30 | 31 | 32 | class ChannelEarningsOrderBy(StrEnum): 33 | TOTAL = 'total' 34 | WEEKLY = 'weekly' 35 | DAILY = 'daily' 36 | LATEST = 'latest' 37 | 38 | 39 | class ChannelEarningsScope(StrEnum): 40 | AIRDROP = 'airdrop' 41 | DAILY = 'daily' 42 | 43 | 44 | class ChannelEarningsType(StrEnum): 45 | POINTS = 'points' 46 | TOKENS = 'tokens' 47 | 48 | 49 | class ChannelFidType(StrEnum): 50 | MEMBER = 'member' 51 | FOLLOWER = 'follower' 52 | -------------------------------------------------------------------------------- /serve/app/models/graph_model.py: -------------------------------------------------------------------------------- 1 | import io 2 | from enum import Enum 3 | from typing import NamedTuple 4 | 5 | import igraph 6 | import pandas 7 | 8 | 9 | class GraphType(Enum): 10 | following = 1 11 | # engagement = 3 12 | # v3engagement = 9 13 | ninetydays = 5 14 | 15 | 16 | class GraphTimeframe(str, Enum): 17 | # lifetime = "lifetime" 18 | ninetydays = "90d" 19 | 20 | 21 | class Graph(NamedTuple): 22 | 
success_file: str 23 | df: pandas.DataFrame 24 | graph: igraph.Graph 25 | type: GraphType 26 | mtime: float 27 | 28 | def __str__(self): 29 | df_info = io.StringIO() 30 | self.df.info(buf=df_info) 31 | return f""" 32 | type: {self.type} 33 | dataframe: {df_info.getvalue()} 34 | igraph: {self.graph.summary()} 35 | mtime: {self.mtime} 36 | """ 37 | -------------------------------------------------------------------------------- /serve/app/models/score_model.py: -------------------------------------------------------------------------------- 1 | import re 2 | from enum import StrEnum 3 | from typing import NamedTuple, Self 4 | 5 | 6 | class ScoreAgg(StrEnum): 7 | RMS = 'rms' 8 | SUMSQUARE = 'sumsquare' 9 | SUM = 'sum' 10 | SUMCUBEROOT = 'sumcuberoot' 11 | 12 | 13 | class Voting(StrEnum): 14 | SINGLE = 'single' 15 | MULTIPLE = 'multiple' 16 | # TODO 17 | # QUADRATIC = 'quadratic' 18 | 19 | 20 | class QueryType(StrEnum): 21 | SUPERLITE = 'superlite' 22 | LITE = 'lite' 23 | HEAVY = 'heavy' 24 | 25 | 26 | class EngagementType(StrEnum): 27 | V1 = '1.0' 28 | V3 = '2.0' 29 | 30 | 31 | engagement_ids = dict() 32 | engagement_ids[EngagementType.V1] = 3 33 | engagement_ids[EngagementType.V3] = 9 34 | 35 | 36 | class Weights(NamedTuple): 37 | cast: int = 10 38 | recast: int = 5 39 | reply: int = 7 40 | like: int = 1 41 | 42 | @staticmethod 43 | def from_str(weights_str: str) -> Self: 44 | wts = re.search( 45 | r'^([lL](\d{1,2}))?([cC](\d{1,2}))?([rR](\d{1,2}))?([yY](\d{1,2}))?$', 46 | weights_str, 47 | ) 48 | if wts is None: 49 | raise Exception("Invalid weights") 50 | return Weights( 51 | like=0 if wts.group(2) is None else wts.group(2), 52 | cast=0 if wts.group(4) is None else wts.group(4), 53 | recast=0 if wts.group(6) is None else wts.group(6), 54 | reply=0 if wts.group(8) is None else wts.group(8), 55 | ) 56 | -------------------------------------------------------------------------------- /serve/app/routers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/serve/app/routers/__init__.py -------------------------------------------------------------------------------- /serve/app/routers/token_router.py: -------------------------------------------------------------------------------- 1 | from collections.abc import Sequence 2 | from typing import Self 3 | 4 | from asyncpg import Pool 5 | from eth_typing import ChecksumAddress 6 | from eth_utils import to_bytes, to_checksum_address 7 | from fastapi import APIRouter, Depends, HTTPException, Path, Query 8 | from pydantic import BaseModel, ValidationError, field_validator 9 | 10 | from ..dependencies import db_pool 11 | from ..dependencies.db_utils import get_token_balances 12 | 13 | router = APIRouter(prefix="/{token}", tags=["Token"]) 14 | 15 | 16 | class Token(BaseModel): 17 | """ 18 | Token address. 19 | 20 | TODO(ek) - expand to CAIP-19, to add chain ID and stuff. 
21 | """ 22 | 23 | address: ChecksumAddress 24 | 25 | @field_validator("address", mode="before") 26 | def ensure_address(cls, v): 27 | try: 28 | return to_checksum_address(v) 29 | except Exception: 30 | raise ValueError(f"Invalid token address: {v!r}") 31 | 32 | @classmethod 33 | def from_str(cls, v: str) -> Self: 34 | return cls(address=to_checksum_address(v)) 35 | 36 | 37 | def get_token(token: str = Path(description="ERC20 token address")) -> Token: 38 | try: 39 | return Token.from_str(token) 40 | except ValidationError as e: 41 | raise HTTPException(status_code=422, detail=f"Invalid token {token!r}") 42 | 43 | 44 | @router.get("/balances") 45 | async def get_balances( 46 | token: Token = Depends(get_token), 47 | fids: Sequence[int] = Query(..., alias='fid', min_items=1), 48 | pool: Pool = Depends(db_pool.get_db), 49 | ): 50 | rows = await get_token_balances(to_bytes(hexstr=token.address), fids, pool) 51 | balances = {fid: value for fid, value in rows} 52 | return { 53 | "balances": [ 54 | {"fid": fid, "value": str(int(balances.get(fid, 0)))} for fid in fids 55 | ] 56 | } 57 | -------------------------------------------------------------------------------- /serve/app/routers/user_router.py: -------------------------------------------------------------------------------- 1 | from typing import Annotated, List, Optional 2 | 3 | from asyncpg.pool import Pool 4 | from fastapi import APIRouter, Depends, Header, Query 5 | from loguru import logger 6 | 7 | from ..dependencies import db_pool, db_utils, graph 8 | 9 | router = APIRouter(tags=["User Labels (Requires API Key)"]) 10 | 11 | 12 | @router.get("/labels/global/top_casters") 13 | async def get_top_global_casters( 14 | x_api_key: Optional[str] = Header(None), # used only for swagger ui 15 | offset: Annotated[int | None, Query()] = 0, 16 | limit: Annotated[int | None, Query(le=1000)] = 100, 17 | pool: Pool = Depends(db_pool.get_db), 18 | ): 19 | """ 20 | Get the top global casters 21 | This API takes optional parameters - 22 | offset and limit 23 | Parameter 'offset' is used to specify how many results to skip 24 | and can be useful for paginating through results. \n 25 | Parameter 'limit' is used to specify the number of results to return. \n 26 | Header 'x-api-key' is used to authenticate the user. Please contact hello@karma3labs.com or https://t.me/Karma3Labs to get the trial API key. \n 27 | """ 28 | 29 | top_casters = await db_utils.get_top_casters(offset=offset, limit=limit, pool=pool) 30 | return {"result": top_casters} 31 | 32 | 33 | @router.get("/labels/global/top_spammers") 34 | async def get_top_global_spammers( 35 | x_api_key: Optional[str] = Header(None), # used only for swagger ui 36 | offset: Annotated[int | None, Query()] = 0, 37 | limit: Annotated[int | None, Query(le=1000)] = 100, 38 | pool: Pool = Depends(db_pool.get_db), 39 | ): 40 | """ 41 | Get the top global spammers 42 | This API takes optional parameters - 43 | offset and limit 44 | Parameter 'offset' is used to specify how many results to skip 45 | and can be useful for paginating through results. \n 46 | Parameter 'limit' is used to specify the number of results to return. \n 47 | Header 'x-api-key' is used to authenticate the user. Please contact hello@karma3labs.com or https://t.me/Karma3Labs to get the trial API key. 
\n 48 | """ 49 | 50 | top_spammers = await db_utils.get_top_spammers( 51 | offset=offset, limit=limit, pool=pool 52 | ) 53 | return {"result": top_spammers} 54 | -------------------------------------------------------------------------------- /serve/docker-compose.yml: -------------------------------------------------------------------------------- 1 | services: 2 | farcaster-graph: 3 | build: . 4 | container_name: farcaster-graph 5 | image: farcaster-graph:latest 6 | volumes: 7 | - /home/ubuntu/serve_files:/tmp 8 | environment: 9 | PORT: 8000 10 | ports: 11 | - '8000:8000' 12 | deploy: 13 | resources: 14 | limits: 15 | memory: 64G 16 | restart: unless-stopped 17 | extra_hosts: 18 | - "host.docker.internal:host-gateway" 19 | networks: 20 | - farcaster-network 21 | 22 | networks: 23 | farcaster-network: 24 | name: farcaster-network 25 | external: true -------------------------------------------------------------------------------- /serve/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "serve" 3 | version = "0.1.0" 4 | description = "" 5 | authors = ["Vijay Mariadassou "] 6 | readme = "README.md" 7 | 8 | [tool.poetry.dependencies] 9 | python = "^3.12" 10 | fastapi = "^0.109.0" 11 | uvicorn = "^0.27.0" 12 | asyncpg = "^0.29.0" 13 | sqlalchemy = "^2.0.25" 14 | loguru = "^0.7.2" 15 | igraph = "^0.11.3" 16 | pydantic-settings = "^2.1.0" 17 | psutil = "^5.9.8" 18 | pandas = {extras = ["performance"], version = "^2.2.2"} 19 | numpy = "^1.26.4" 20 | requests = "^2.31.0" 21 | opentelemetry-distro = "0.43b0" 22 | opentelemetry-instrumentation-fastapi = "0.43b0" 23 | opentelemetry-instrumentation-logging = "0.43b0" 24 | opentelemetry-exporter-otlp = "1.22.0" 25 | prometheus-client = "0.19.0" 26 | asgi-correlation-id = "^4.3.1" 27 | niquests = "^3.14.0" 28 | py-memoize = "^3.1.1" 29 | black = "^25.1.0" 30 | async-lru = "^2.0.5" 31 | isort = "^6.0.1" 32 | eth-typing = "^5.2.1" 33 | eth-utils = "^5.3.0" 34 | eth-hash = {extras = ["pycryptodome"], version = "^0.7.1"} 35 | cashews = {extras = ["diskcache"], version = "^7.4.0"} 36 | 37 | [build-system] 38 | requires = ["poetry-core"] 39 | build-backend = "poetry.core.masonry.api" 40 | 41 | [project] 42 | name = "serve" 43 | version = "0.1.0" 44 | requires-python = ">=3.12" 45 | 46 | [tool.black] 47 | skip-string-normalization = true 48 | -------------------------------------------------------------------------------- /serve/samples/fc_90dv3_fid_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/serve/samples/fc_90dv3_fid_SUCCESS -------------------------------------------------------------------------------- /serve/samples/fc_90dv3_fid_df.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/serve/samples/fc_90dv3_fid_df.pkl -------------------------------------------------------------------------------- /serve/samples/fc_90dv3_fid_ig.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/serve/samples/fc_90dv3_fid_ig.pkl -------------------------------------------------------------------------------- /serve/samples/fc_engagement_SUCCESS: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/serve/samples/fc_engagement_SUCCESS -------------------------------------------------------------------------------- /serve/samples/fc_engagement_df.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/serve/samples/fc_engagement_df.pkl -------------------------------------------------------------------------------- /serve/samples/fc_engagement_fid_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/serve/samples/fc_engagement_fid_SUCCESS -------------------------------------------------------------------------------- /serve/samples/fc_engagement_fid_df.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/serve/samples/fc_engagement_fid_df.pkl -------------------------------------------------------------------------------- /serve/samples/fc_engagement_fid_ig.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/serve/samples/fc_engagement_fid_ig.pkl -------------------------------------------------------------------------------- /serve/samples/fc_engagement_idx.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/serve/samples/fc_engagement_idx.pkl -------------------------------------------------------------------------------- /serve/samples/fc_engagement_ig.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/serve/samples/fc_engagement_ig.pkl -------------------------------------------------------------------------------- /serve/samples/fc_following_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/serve/samples/fc_following_SUCCESS -------------------------------------------------------------------------------- /serve/samples/fc_following_df.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/serve/samples/fc_following_df.pkl -------------------------------------------------------------------------------- /serve/samples/fc_following_fid_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/serve/samples/fc_following_fid_SUCCESS -------------------------------------------------------------------------------- /serve/samples/fc_following_fid_df.pkl: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/serve/samples/fc_following_fid_df.pkl -------------------------------------------------------------------------------- /serve/samples/fc_following_fid_ig.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/serve/samples/fc_following_fid_ig.pkl -------------------------------------------------------------------------------- /serve/samples/fc_following_idx.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/serve/samples/fc_following_idx.pkl -------------------------------------------------------------------------------- /serve/samples/fc_following_ig.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/serve/samples/fc_following_ig.pkl -------------------------------------------------------------------------------- /serve/samples/personal_graph.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/serve/samples/personal_graph.parquet -------------------------------------------------------------------------------- /serve/scripts/lint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | unset -v tooldir opt 3 | OPTIND=1 4 | while getopts :b: opt 5 | do 6 | case "${opt}" in 7 | '?') echo "unrecognized option -${OPTARG}" >&2; exit 64;; 8 | ':') echo "missing argument for -${OPTARG}" >&2; exit 64;; 9 | b) tooldir="${OPTARG}";; 10 | *) echo "unhandled option -${opt}" >&2; exit 70;; 11 | esac 12 | done 13 | shift $((OPTIND - 1)) 14 | case "${tooldir+set}" in 15 | set) PATH="${tooldir}${PATH+":${PATH}"}"; export PATH;; 16 | esac 17 | case $# in 18 | 0) 19 | set -- . 
20 | ;; 21 | esac 22 | isort --profile=black "$@" || exit 23 | black --quiet "$@" || exit 24 | #autopep8 --in-place --aggressive --aggressive --recursive "$@" || exit 25 | -------------------------------------------------------------------------------- /serve/static/favicon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/serve/static/favicon.png -------------------------------------------------------------------------------- /sql/counts_by_day.sql: -------------------------------------------------------------------------------- 1 | WITH casts_counts AS ( 2 | SELECT DATE_TRUNC('day', timestamp) AS day, COUNT(*) AS casts_count 3 | FROM casts 4 | GROUP BY DATE_TRUNC('day', timestamp) 5 | ), 6 | links_counts AS ( 7 | SELECT DATE_TRUNC('day', timestamp) AS day, COUNT(*) AS links_count 8 | FROM links 9 | GROUP BY DATE_TRUNC('day', timestamp) 10 | ), 11 | messages_counts AS ( 12 | SELECT DATE_TRUNC('day', timestamp) AS day, COUNT(*) AS messages_count 13 | FROM messages 14 | GROUP BY DATE_TRUNC('day', timestamp) 15 | ), 16 | reactions_counts AS ( 17 | SELECT DATE_TRUNC('day', timestamp) AS day, COUNT(*) AS reactions_count 18 | FROM reactions 19 | GROUP BY DATE_TRUNC('day', timestamp) 20 | ), 21 | user_data_counts AS ( 22 | SELECT DATE_TRUNC('day', timestamp) AS day, COUNT(*) AS user_data_count 23 | FROM user_data 24 | GROUP BY DATE_TRUNC('day', timestamp) 25 | ), 26 | verifications_counts AS ( 27 | SELECT DATE_TRUNC('day', timestamp) AS day, COUNT(*) AS verifications_count 28 | FROM verifications 29 | GROUP BY DATE_TRUNC('day', timestamp) 30 | ) 31 | SELECT 32 | COALESCE(casts.day, links.day, messages.day, reactions.day, user_data.day, verifications.day) AS day, 33 | COALESCE(casts_count, 0) AS casts_count, 34 | COALESCE(links_count, 0) AS links_count, 35 | COALESCE(messages_count, 0) AS messages_count, 36 | COALESCE(reactions_count, 0) AS reactions_count, 37 | COALESCE(user_data_count, 0) AS user_data_count, 38 | COALESCE(verifications_count, 0) AS verifications_count 39 | FROM casts_counts casts 40 | FULL OUTER JOIN links_counts links ON casts.day = links.day 41 | FULL OUTER JOIN reactions_counts reactions ON COALESCE(casts.day, links.day) = reactions.day 42 | FULL OUTER JOIN verifications_counts verifications ON COALESCE(casts.day, links.day, reactions.day) = verifications.day 43 | FULL OUTER JOIN messages_counts messages ON COALESCE(casts.day, links.day, reactions.day, verifications.day) = messages.day 44 | FULL OUTER JOIN user_data_counts user_data ON COALESCE(casts.day, links.day, reactions.day, verifications.day, messages.day) = user_data.day 45 | ORDER BY day DESC 46 | LIMIT 1000; 47 | -------------------------------------------------------------------------------- /sql/counts_by_table.sql: -------------------------------------------------------------------------------- 1 | WITH 2 | q_casts AS (SELECT COUNT(1) AS casts FROM casts), 3 | q_chain_events AS (SELECT COUNT(1) AS chain_events FROM chain_events), 4 | q_fids AS (SELECT COUNT(1) AS fids FROM fids), 5 | q_fnames AS (SELECT COUNT(1) AS fnames FROM fnames), 6 | q_links AS (SELECT COUNT(1) AS links FROM links), 7 | q_messages AS (SELECT COUNT(1) AS messages FROM messages), 8 | q_reactions AS (SELECT COUNT(1) AS reactions FROM reactions), 9 | q_signers AS (SELECT COUNT(1) AS signers FROM signers), 10 | q_storage_alloc AS (SELECT COUNT(1) AS storage_alloc FROM storage_allocations), 11 | q_user_data AS 
(SELECT COUNT(1) AS user_data FROM user_data), 12 | q_username_proofs AS (SELECT COUNT(1) AS username_proofs FROM username_proofs), 13 | q_verifications AS (SELECT COUNT(1) AS verifications FROM verifications) 14 | 15 | SELECT 16 | q_casts.casts, 17 | q_chain_events.chain_events, 18 | q_fids.fids, 19 | q_fnames.fnames, 20 | q_links.links, 21 | q_messages.messages, 22 | q_reactions.reactions, 23 | q_signers.signers, 24 | q_storage_alloc.storage_alloc, 25 | q_user_data.user_data, 26 | q_username_proofs.username_proofs, 27 | q_verifications.verifications 28 | FROM 29 | q_casts, 30 | q_chain_events, 31 | q_fids, 32 | q_fnames, 33 | q_links, 34 | q_messages, 35 | q_reactions, 36 | q_signers, 37 | q_storage_alloc, 38 | q_user_data, 39 | q_username_proofs, 40 | q_verifications; 41 | -------------------------------------------------------------------------------- /sql/neynar-replica/.env.sample: -------------------------------------------------------------------------------- 1 | POSTGRES_HOST=127.0.0.1 2 | POSTGRES_PORT=9541 3 | POSTGRES_USER=postgres 4 | POSTGRES_NAME=postgres 5 | POSTGRES_PASSWORD=CHANGEME 6 | PRIMARY_HOST=135.181.236.185 7 | PRIMARY_PORT=9541 8 | PRIMARY_USER=replica_user 9 | PRIMARY_PASSWORD=CHANGEME 10 | PRIMARY_SLOT_NAME=eigen10 11 | PGDATA=/var/lib/postgresql/data 12 | GID=999 13 | UID= 14 | HOST_VOLUME=/data/pgdata -------------------------------------------------------------------------------- /sql/neynar-replica/Dockerfile: -------------------------------------------------------------------------------- 1 | Dockerfile.noble -------------------------------------------------------------------------------- /sql/neynar-replica/Dockerfile.alpine: -------------------------------------------------------------------------------- 1 | FROM postgres:17.2-alpine 2 | 3 | # Install sudo and configure it for passwordless operation 4 | RUN apk add --no-cache sudo && \ 5 | echo "postgres ALL=(ALL) NOPASSWD: ALL" > /etc/sudoers.d/postgres 6 | 7 | COPY entrypoint.sh /usr/local/bin/entrypoint.sh 8 | RUN chmod +x /usr/local/bin/entrypoint.sh 9 | 10 | # Set the entrypoint script 11 | ENTRYPOINT ["/usr/local/bin/entrypoint.sh"] 12 | -------------------------------------------------------------------------------- /sql/neynar-replica/Dockerfile.noble: -------------------------------------------------------------------------------- 1 | # Use a base image with glibc 2.39 2 | FROM ubuntu:noble 3 | 4 | # Install necessary packages 5 | RUN apt-get update && \ 6 | apt-get install -y sudo curl gnupg lsb-release && \ 7 | apt-get clean && \ 8 | rm -rf /var/lib/apt/lists/*; 9 | 10 | # Install locales 11 | RUN apt-get update && \ 12 | apt-get install -y --no-install-recommends locales && \ 13 | rm -rf /var/lib/apt/lists/*; 14 | RUN echo 'en_US.UTF-8 UTF-8' >> /etc/locale.gen; \ 15 | locale-gen; \ 16 | locale -a | grep 'en_US.utf8' 17 | 18 | # Add PostgreSQL repository 19 | RUN sh -c 'echo "deb http://apt.postgresql.org/pub/repos/apt $(lsb_release -cs)-pgdg main" > /etc/apt/sources.list.d/pgdg.list' && \ 20 | curl -fsSL https://www.postgresql.org/media/keys/ACCC4CF8.asc | sudo gpg --dearmor -o /etc/apt/trusted.gpg.d/postgresql.gpg 21 | 22 | ARG GID 23 | ARG UID 24 | # Force postgres installation to use uid=999 and gid=999 25 | RUN set -eux; \ 26 | groupadd -r postgres --gid=${GID}; \ 27 | useradd -r -g postgres --uid=${UID} --home-dir=/var/lib/postgresql --shell=/bin/bash postgres; 28 | 29 | # Install PostgreSQL 30 | RUN apt-get update 31 | RUN apt-get install -y postgresql-17 32 | RUN apt-get clean && \ 33 
| rm -rf /var/lib/apt/lists/* 34 | 35 | # Set up sudo for postgres user 36 | RUN echo "postgres ALL=(ALL) NOPASSWD: ALL" > /etc/sudoers.d/postgres && \ 37 | chmod 440 /etc/sudoers.d/postgres 38 | 39 | ENV PG_MAJOR=17 40 | ENV PATH=$PATH:/usr/lib/postgresql/$PG_MAJOR/bin 41 | 42 | RUN echo 'Defaults secure_path="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/lib/postgresql/17/bin"' \ 43 | >> /etc/sudoers.d/postgres 44 | 45 | COPY entrypoint.sh /usr/local/bin/entrypoint.sh 46 | RUN chmod +x /usr/local/bin/entrypoint.sh 47 | 48 | # Set the entrypoint script 49 | ENTRYPOINT ["/usr/local/bin/entrypoint.sh"] 50 | -------------------------------------------------------------------------------- /sql/neynar-replica/docker-compose.yml: -------------------------------------------------------------------------------- 1 | services: 2 | postgres: 3 | restart: unless-stopped 4 | container_name: eigen8-replica-postgres 5 | shm_size: '32gb' 6 | build: 7 | context: . 8 | args: 9 | GID: ${GID} 10 | UID: ${UID} 11 | ports: 12 | - '${POSTGRES_PORT}:5432' 13 | environment: 14 | POSTGRES_DB: ${POSTGRES_NAME} 15 | POSTGRES_USER: ${POSTGRES_USER} 16 | POSTGRES_PASSWORD: ${POSTGRES_PASSWORD} 17 | PRIMARY_HOST: ${PRIMARY_HOST} 18 | PRIMARY_PORT: ${PRIMARY_PORT} 19 | PRIMARY_USER: ${PRIMARY_USER} 20 | PRIMARY_PASSWORD: ${PRIMARY_PASSWORD} 21 | PRIMARY_SLOT_NAME: ${PRIMARY_SLOT_NAME} 22 | PGDATA: ${PGDATA} 23 | volumes: 24 | - ${HOST_VOLUME}:/var/lib/postgresql/data 25 | - ${PWD}/postgresql.conf:/usr/local/bin/postgresql.conf 26 | - ${PWD}/pg_hba.conf:/usr/local/bin/pg_hba.conf 27 | healthcheck: 28 | test: ['CMD-SHELL', 'pg_isready --dbname=${POSTGRES_NAME} -U ${PRIMARY_USER}'] 29 | interval: 10s 30 | timeout: 10s 31 | retries: 3 32 | networks: 33 | - farcaster-network 34 | 35 | networks: 36 | farcaster-network: 37 | external: true 38 | name: farcaster-network 39 | 40 | volumes: 41 | postgres-data: 42 | name: neynar-replica 43 | -------------------------------------------------------------------------------- /sql/neynar-replica/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # Directory where the data will be stored 5 | DATA_DIR="/var/lib/postgresql/data" 6 | CONF_DIR="/var/lib/postgresql/conf" 7 | 8 | # Ensure environment variables are set 9 | if [ -z "$PRIMARY_HOST" ] || [ -z "$PRIMARY_PORT" ] || [ -z "$PRIMARY_USER" ] || [ -z "$PRIMARY_PASSWORD" ] || [ -z "$PRIMARY_SLOT_NAME" ]; then 10 | echo "Error: Environment variables not set correctly." 11 | exit 1 12 | fi 13 | 14 | # Prepare configuration directory (outside of data directory) 15 | mkdir -p $CONF_DIR 16 | cp /usr/local/bin/postgresql.conf $CONF_DIR/postgresql.conf 17 | cp /usr/local/bin/pg_hba.conf $CONF_DIR/pg_hba.conf 18 | 19 | # Check if the data directory is empty 20 | if [ "$(ls -A $DATA_DIR)" ]; then 21 | echo "Data directory is not empty." 22 | else 23 | echo "Data directory is empty, setting up .pgpass file..." 24 | echo "$PRIMARY_HOST:$PRIMARY_PORT:*:$PRIMARY_USER:$PRIMARY_PASSWORD" > /root/.pgpass 25 | chmod 600 /root/.pgpass 26 | 27 | echo "Initiating base backup..." 
28 | pg_config --version 29 | pg_basebackup -h $PRIMARY_HOST -p $PRIMARY_PORT -D $DATA_DIR -U $PRIMARY_USER -vP -w -Xs -R -S $PRIMARY_SLOT_NAME 30 | 31 | # Set the correct permissions 32 | chmod 0700 $DATA_DIR 33 | chown -R postgres:postgres $DATA_DIR 34 | 35 | # Move the customized postgresql.conf back to the data directory 36 | mv $CONF_DIR/postgresql.conf $DATA_DIR/postgresql.conf 37 | mv $CONF_DIR/pg_hba.conf $DATA_DIR/pg_hba.conf 38 | 39 | echo "Backup and configuration complete. Starting PostgreSQL in standby mode." 40 | fi 41 | 42 | 43 | # Start PostgreSQL using sudo 44 | exec sudo -u postgres postgres -D $DATA_DIR 45 | -------------------------------------------------------------------------------- /sql/neynar-replica/postgresql.conf: -------------------------------------------------------------------------------- 1 | listen_addresses = '*' # what IP address(es) to listen on; 2 | port = 5432 # (change requires restart) 3 | max_connections = 400 # (change requires restart) 4 | shared_buffers = 8GB # min 128kB 5 | work_mem = 64MB # min 64kB 6 | maintenance_work_mem = 1GB # min 64kB 7 | dynamic_shared_memory_type = posix # the default is usually the first option 8 | max_worker_processes = 16 # (change requires restart) 9 | wal_level = replica # minimal, replica, or logical 10 | synchronous_commit = local # synchronization level; 11 | wal_log_hints = on # also do full page writes of non-critical updates 12 | wal_compression = on # enables compression of full-page writes; 13 | checkpoint_timeout = 60min # range 30s-1d 14 | max_wal_size = 16GB 15 | min_wal_size = 80MB 16 | max_wal_senders = 10 # max number of walsender processes 17 | hot_standby = on # "off" disallows queries during recovery 18 | wal_receiver_timeout = 5min # time that receiver waits for 19 | random_page_cost = 1.1 # same scale as above 20 | effective_cache_size = 16GB 21 | log_line_prefix = '%m [%p] %q%u@%d ' # special values: 22 | log_timezone = UTC 23 | cluster_name = '17/main' # added to process titles if nonempty 24 | default_transaction_read_only = on 25 | datestyle = 'iso, mdy' 26 | timezone = UTC 27 | shared_preload_libraries = 'pg_stat_statements' # (change requires restart) 28 | -------------------------------------------------------------------------------- /sql/replicator_drop_fk.sql: -------------------------------------------------------------------------------- 1 | ALTER TABLE IF EXISTS ONLY public.verifications DROP CONSTRAINT IF EXISTS verifications_hash_foreign; 2 | ALTER TABLE IF EXISTS ONLY public.verifications DROP CONSTRAINT IF EXISTS verifications_fid_foreign; 3 | ALTER TABLE IF EXISTS ONLY public.username_proofs DROP CONSTRAINT IF EXISTS username_proofs_fid_foreign; 4 | ALTER TABLE IF EXISTS ONLY public.user_data DROP CONSTRAINT IF EXISTS user_data_hash_foreign; 5 | ALTER TABLE IF EXISTS ONLY public.user_data DROP CONSTRAINT IF EXISTS user_data_fid_foreign; 6 | ALTER TABLE IF EXISTS ONLY public.signers DROP CONSTRAINT IF EXISTS signers_requester_fid_foreign; 7 | ALTER TABLE IF EXISTS ONLY public.signers DROP CONSTRAINT IF EXISTS signers_remove_chain_event_id_foreign; 8 | ALTER TABLE IF EXISTS ONLY public.signers DROP CONSTRAINT IF EXISTS signers_fid_foreign; 9 | ALTER TABLE IF EXISTS ONLY public.signers DROP CONSTRAINT IF EXISTS signers_add_chain_event_id_foreign; 10 | ALTER TABLE IF EXISTS ONLY public.reactions DROP CONSTRAINT IF EXISTS reactions_target_hash_foreign; 11 | ALTER TABLE IF EXISTS ONLY public.reactions DROP CONSTRAINT IF EXISTS reactions_hash_foreign; 12 | ALTER TABLE IF EXISTS ONLY 
public.reactions DROP CONSTRAINT IF EXISTS reactions_fid_foreign; 13 | ALTER TABLE IF EXISTS ONLY public.messages DROP CONSTRAINT IF EXISTS messages_signer_fid_foreign; 14 | ALTER TABLE IF EXISTS ONLY public.messages DROP CONSTRAINT IF EXISTS messages_fid_foreign; 15 | ALTER TABLE IF EXISTS ONLY public.links DROP CONSTRAINT IF EXISTS links_target_fid_foreign; 16 | ALTER TABLE IF EXISTS ONLY public.links DROP CONSTRAINT IF EXISTS links_fid_foreign; 17 | ALTER TABLE IF EXISTS ONLY public.fnames DROP CONSTRAINT IF EXISTS fnames_fid_foreign; 18 | ALTER TABLE IF EXISTS ONLY public.storage_allocations DROP CONSTRAINT IF EXISTS fids_chain_event_id_foreign; 19 | ALTER TABLE IF EXISTS ONLY public.fids DROP CONSTRAINT IF EXISTS fids_chain_event_id_foreign; 20 | ALTER TABLE IF EXISTS ONLY public.casts DROP CONSTRAINT IF EXISTS casts_hash_foreign; 21 | ALTER TABLE IF EXISTS ONLY public.casts DROP CONSTRAINT IF EXISTS casts_fid_foreign; 22 | --------------------------------------------------------------------------------