├── doc ├── detection-prototype.png ├── 04-validation-approach.pdf ├── 01-datasets-description.pdf ├── 02-detection-system-design.pdf └── 03-legitimate-route-change-identification.pdf ├── .gitignore ├── data ├── caida_as_rel │ ├── query.py │ └── fetch_data.py ├── caida_as_org │ ├── query.py │ └── fetch_data.py ├── bgpstream │ ├── fetch_data.py │ └── locate_route_change.py └── routeviews │ ├── fetch_rib.py │ └── fetch_updates.py ├── post_processor ├── whois_lookup.py ├── rpki_validator.py ├── irr_validator.py ├── html │ └── template_routeviews.html ├── alarm_postprocess_routeviews.py └── summary_routeviews.py ├── BEAM_engine ├── train.py └── BEAM_model.py ├── routing_monitor ├── detect_route_change_routeviews.py └── monitor.py ├── anomaly_detector ├── BEAM_diff_evaluator_routeviews.py ├── report_anomaly_routeviews.py └── utils.py └── readme.md /doc/detection-prototype.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhchen-tsinghua/routing-anomaly-detection/HEAD/doc/detection-prototype.png -------------------------------------------------------------------------------- /doc/04-validation-approach.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhchen-tsinghua/routing-anomaly-detection/HEAD/doc/04-validation-approach.pdf -------------------------------------------------------------------------------- /doc/01-datasets-description.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhchen-tsinghua/routing-anomaly-detection/HEAD/doc/01-datasets-description.pdf -------------------------------------------------------------------------------- /doc/02-detection-system-design.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhchen-tsinghua/routing-anomaly-detection/HEAD/doc/02-detection-system-design.pdf -------------------------------------------------------------------------------- /doc/03-legitimate-route-change-identification.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhchen-tsinghua/routing-anomaly-detection/HEAD/doc/03-legitimate-route-change-identification.pdf -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | *.vim 3 | 4 | data/routeviews/updates 5 | data/routeviews/ribs 6 | data/routeviews/cache 7 | data/routeviews/bgpd 8 | data/caida_as_rel/serial-1 9 | data/caida_as_rel/serial-2 10 | data/caida_as_org/cache 11 | data/caida_as_org/fetched_data 12 | data/bgpstream/cache 13 | data/bgpstream/event 14 | 15 | __pycache__ 16 | 17 | BEAM_engine/models/ 18 | BEAM_engine/models 19 | 20 | routing_monitor/detection_result/ 21 | routing_monitor/detection_result 22 | 23 | post_processor/rpki_cache/ 24 | post_processor/rpki_cache 25 | -------------------------------------------------------------------------------- /data/caida_as_rel/query.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | #-*- coding: utf-8 -*- 3 | 4 | from pathlib import Path 5 | import click 6 | 7 | SCRIPT_DIR = Path(__file__).resolve().parent 8 | 9 | def load(serial, time): 10 | f = SCRIPT_DIR/f"serial-{serial}"/f"{time}.as-rel{'' if serial == '1' else 2}.txt" 11 | 12 | ngbrs = 
dict() 13 | for line in open(f, "r"): 14 | if line[0] == "#": continue 15 | i, j, k = line.strip().split("|")[:3] 16 | ngbrs.setdefault(i, {-1: set(), 0: set(), 1: set()})[int(k)].add(j) 17 | ngbrs.setdefault(j, {-1: set(), 0: set(), 1: set()})[-int(k)].add(i) 18 | 19 | def query(i, j): 20 | if i not in ngbrs: print(f"Unknown AS: {i}"); return None 21 | if j not in ngbrs: print(f"Unknown AS: {j}"); return None 22 | for k,v in ngbrs[i].items(): 23 | if j in v: return k 24 | return None 25 | 26 | return query 27 | 28 | 29 | @click.command() 30 | @click.option("--serial", "-s", type=click.Choice(["1", "2"]), default="1", help="serial 1 or 2") 31 | @click.option("--time", "-t", type=int, required=True, help="timestamp, e.g., 20200901") 32 | def main(serial, time): 33 | query = load(serial, time) 34 | 35 | while True: 36 | i = input("AS1: ") 37 | j = input("AS2: ") 38 | print(query(i, j)) 39 | 40 | if __name__ == "__main__": 41 | main() 42 | -------------------------------------------------------------------------------- /post_processor/whois_lookup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | #-*- coding: utf-8 -*- 3 | 4 | import subprocess 5 | from datetime import datetime 6 | from pathlib import Path 7 | import re 8 | 9 | script_dir = Path(__file__).resolve().parent 10 | cache_dir = script_dir/"whois_cache" 11 | cache_dir.mkdir(parents=True, exist_ok=True) 12 | 13 | def whois_lookup(target, cache_date=datetime.now().strftime("%Y-%m-%d")): 14 | cache_file = cache_dir/f"{target.replace('/', '_')}.{cache_date}.txt" 15 | if cache_file.exists(): 16 | with cache_file.open("r", encoding="utf-8") as f: 17 | content = f.read() 18 | else: 19 | try: 20 | result = subprocess.run(["whois", target], 21 | text=True, capture_output=True, check=True) 22 | content = result.stdout 23 | with cache_file.open("w", encoding="utf-8") as f: 24 | f.write(content) 25 | except Exception as e: 26 | print(f"Failed to perform WHOIS lookup for {target}: {e}") 27 | content = "" 28 | return content 29 | 30 | def whois_match(prefix_str, asn_str): 31 | whois_content = whois_lookup(prefix_str) 32 | for line in whois_content.split("\n"): 33 | if not line or line.startswith("%"): continue 34 | match = re.match(r"^(\S+):\s+(.*)$", line) 35 | if match: 36 | _, value = match.groups() 37 | for asn_value in re.findall(r"as\d+", value): 38 | if f"as{asn_str}" == asn_value: 39 | return True 40 | return False 41 | -------------------------------------------------------------------------------- /data/caida_as_org/query.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | #-*- coding: utf-8 -*- 3 | 4 | from pathlib import Path 5 | import click 6 | 7 | SCRIPT_DIR = Path(__file__).resolve().parent 8 | 9 | def load(time): 10 | fname = f"{time}.as-org2info.txt" 11 | lines = open(SCRIPT_DIR/"fetched_data"/fname, "r").readlines() 12 | field1 = "aut|changed|aut_name|org_id|opaque_id|source".split("|") 13 | field2 = "org_id|changed|name|country|source".split("|") 14 | as_info = {} 15 | org_info = {} 16 | for l in lines: 17 | if l[0] == "#": continue 18 | values = l.strip().split("|") 19 | if len(values) == len(field1): 20 | if values[0] in as_info and values[1] < as_info[values[0]]["changed"]: continue 21 | as_info[values[0]] = dict(zip(field1[1:], values[1:])) 22 | if len(values) == len(field2): 23 | if values[0] in org_info and values[1] < org_info[values[0]]["changed"]: continue 24 | org_info[values[0]] = 
dict(zip(field2[1:], values[1:])) 25 | return as_info, org_info 26 | 27 | @click.command() 28 | @click.option("--time", "-t", type=int, required=True, help="timestamp, like 20200901") 29 | def main(time): 30 | as_info, org_info = load(time) 31 | while True: 32 | inp = input("ASN or org_id: ") 33 | if inp in as_info: 34 | print(f"asn: {inp}, {as_info[inp]}") 35 | elif inp in org_info: 36 | print(f"org_id: {inp}, {org_info[inp]}") 37 | else: 38 | print("no result") 39 | 40 | if __name__ == "__main__": 41 | main() 42 | -------------------------------------------------------------------------------- /data/caida_as_rel/fetch_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | #-*- coding: utf-8 -*- 3 | 4 | from pathlib import Path 5 | import subprocess 6 | import click 7 | 8 | SCRIPT_DIR = Path(__file__).resolve().parent 9 | 10 | SERIAL_1_DIR = SCRIPT_DIR / "serial-1" 11 | SERIAL_2_DIR = SCRIPT_DIR / "serial-2" 12 | 13 | SERIAL_1_DIR.mkdir(exist_ok=True, parents=True) 14 | SERIAL_2_DIR.mkdir(exist_ok=True, parents=True) 15 | 16 | def get(serial: str, time: int): 17 | if serial == "1": 18 | fname = f"{time}.as-rel.txt.bz2" 19 | obj = f"https://publicdata.caida.org/datasets/as-relationships/serial-1/{fname}" 20 | out = SERIAL_1_DIR / fname 21 | elif serial == "2": 22 | fname = f"{time}.as-rel2.txt.bz2" 23 | obj = f"https://publicdata.caida.org/datasets/as-relationships/serial-2/{fname}" 24 | out = SERIAL_2_DIR / fname 25 | else: 26 | raise RuntimeError("bad argument") 27 | if out.with_suffix("").exists(): 28 | # print(f"as-relationship for {serial} {time} already existed") 29 | return out.with_suffix("") 30 | subprocess.run(["curl", obj, "--output", str(out)], check=True) 31 | subprocess.run(["bzip2", "-d", str(out)], check=True) 32 | print(f"get as-relationship for {serial} {time}") 33 | return out.with_suffix("") 34 | 35 | @click.command() 36 | @click.option("--serial", "-s", type=click.Choice(["1", "2"]), default="1", help="serial 1 or 2") 37 | @click.option("--time", "-t", type=int, required=True, help="timestamp, e.g., 20200901") 38 | def main(serial, time): 39 | get(serial, time) 40 | 41 | if __name__ == "__main__": 42 | main() 43 | -------------------------------------------------------------------------------- /BEAM_engine/train.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | #-*- coding: utf-8 -*- 3 | 4 | from pathlib import Path 5 | from BEAM_model import BEAM 6 | from shutil import get_terminal_size 7 | import click 8 | import os 9 | 10 | import sys 11 | sys.path.append(str(Path(__file__).resolve().parent.parent)) 12 | 13 | from data.caida_as_rel.fetch_data import get as prepare_edge_file 14 | 15 | @click.command() 16 | @click.option("--serial", "-s", type=click.Choice(["1", "2"]), default="1", help="serial 1 or 2") 17 | @click.option("--time", "-t", type=int, required=True, help="timestamp, e.g., 20200901") 18 | @click.option("--Q", "Q", type=int, default=10, help="hyperparameter Q, e.g., 10") 19 | @click.option("--dimension", type=int, default=128, help="hyperparameter dimension size, e.g., 128") 20 | @click.option("--epoches", type=int, default=1000, help="epoches to train, e.g., 1000") 21 | @click.option("--device", type=int, default=0, help="device to train on") 22 | @click.option("--num-workers", type=int, default=1, help="number of workers") 23 | def main(serial, time, device, **model_params): 24 | os.environ["CUDA_VISIBLE_DEVICES"] = 
f"{device}" 25 | 26 | edge_file = prepare_edge_file(serial, time) 27 | assert edge_file.exists(), f"fail to prepare {edge_file}" 28 | 29 | model_params["edge_file"] = edge_file 30 | 31 | for k, v in model_params.items(): 32 | print(f"{k}: {v}") 33 | print("*"*get_terminal_size().columns) 34 | # input("Press Enter to start.") 35 | 36 | train_dir = Path(__file__).resolve().parent/"models"/ \ 37 | f"{edge_file.stem}.{model_params['epoches']}.{model_params['Q']}.{model_params['dimension']}" 38 | train_dir.mkdir(parents=True, exist_ok=True) 39 | model_params["train_dir"] = train_dir 40 | epoches = model_params.pop("epoches") 41 | 42 | model = BEAM(**model_params) 43 | model.train(epoches=epoches) 44 | model.save_embeddings(path=str(train_dir)) 45 | 46 | if __name__ == "__main__": 47 | main() 48 | -------------------------------------------------------------------------------- /data/caida_as_org/fetch_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | #-*- coding: utf-8 -*- 3 | 4 | from pathlib import Path 5 | from urllib.parse import urljoin 6 | import numpy as np 7 | import json 8 | import subprocess 9 | import click 10 | import re 11 | 12 | SCRIPT_DIR = Path(__file__).resolve().parent 13 | CACHE_DIR = SCRIPT_DIR/"cache" 14 | CACHE_DIR.mkdir(parents=True, exist_ok=True) 15 | OUTPUT_DIR = SCRIPT_DIR/"fetched_data" 16 | OUTPUT_DIR.mkdir(parents=True, exist_ok=True) 17 | 18 | def get_archive_list(refresh=False): 19 | cache_path = CACHE_DIR/f"time2url" 20 | if cache_path.exists() and not refresh: 21 | try: return json.load(open(cache_path, "r")) 22 | except: pass 23 | 24 | url_index = "https://publicdata.caida.org/datasets/as-organizations/" 25 | res = subprocess.check_output(["curl", "-s", url_index]).decode() 26 | res = re.sub(r"\s\s+", " ", res.replace("\n", " ")) 27 | time2url = {} 28 | for fname, time in re.findall(r'\', res): 29 | time2url[time] = urljoin(url_index, fname) 30 | 31 | json.dump(time2url, open(cache_path, "w"), indent=2) 32 | return time2url 33 | 34 | def get_most_recent(time): 35 | time2url = get_archive_list() 36 | times = sorted(time2url.keys()) 37 | idx = np.searchsorted(times, time, "right") 38 | 39 | target_time = times[idx-1] 40 | target_url = time2url[target_time] 41 | 42 | out = OUTPUT_DIR/target_url.split("/")[-1] 43 | if out.with_suffix("").exists(): 44 | # print(f"as-organizations for {target_time} exists") 45 | return target_time, out.with_suffix("") 46 | 47 | subprocess.run(["curl", target_url, "--output", str(out)], check=True) 48 | subprocess.run(["gzip", "-d", str(out)], check=True) 49 | print(f"get as-organizations for {target_time}") 50 | return target_time, out.with_suffix("") 51 | 52 | @click.command() 53 | @click.option("--time", "-t", type=str, required=True, help="timestamp, e.g., 20200901") 54 | def main(time): 55 | get_most_recent(time) 56 | 57 | if __name__ == "__main__": 58 | main() 59 | -------------------------------------------------------------------------------- /routing_monitor/detect_route_change_routeviews.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | #-*- coding: utf-8 -*- 3 | 4 | from pathlib import Path 5 | import pandas as pd 6 | from datetime import datetime, timedelta 7 | import pickle 8 | import click 9 | 10 | import sys 11 | sys.path.append(str(Path(__file__).resolve().parent.parent)) 12 | 13 | from data.routeviews.fetch_updates import load_updates_to_df, get_all_collectors, get_archive_list, 
download_data 14 | from monitor import Monitor 15 | 16 | SCRIPT_DIR = Path(__file__).resolve().parent 17 | 18 | def detect(data, route_change_dir, snapshot_dir): 19 | mon = Monitor() 20 | 21 | for fpath in data: 22 | _, date, time = fpath.name.split(".") 23 | 24 | df = load_updates_to_df(fpath) 25 | df = df.sort_values(by="timestamp") 26 | 27 | mon.consume(df, detect=True) 28 | 29 | route_change_df = pd.DataFrame.from_records(mon.route_changes) 30 | mon.route_changes = [] 31 | 32 | route_change_df.to_csv(route_change_dir/f"{date}.{time}.csv", index=False) 33 | 34 | if time == "2345": 35 | pickle.dump(mon, open(snapshot_dir/f"{date}.end-of-the-day", "wb")) 36 | 37 | @click.command() 38 | @click.option("--collector", "-c", type=str, default="wide", help="the name of RouteView collector to use") 39 | @click.option("--year", "-y", type=int, required=True, help="the year to monitor, e.g., 2024") 40 | @click.option("--month", "-m", type=int, required=True, help="the month to monitor, e.g., 8") 41 | def detect_monthly_for(collector, year, month): 42 | result_dir = SCRIPT_DIR/"detection_result"/collector 43 | route_change_dir = result_dir/"route_change" 44 | snapshot_dir = result_dir/"snapshot" 45 | 46 | route_change_dir.mkdir(exist_ok=True, parents=True) 47 | snapshot_dir.mkdir(exist_ok=True, parents=True) 48 | 49 | collectors2url = get_all_collectors() 50 | 51 | d1 = datetime(year=year, month=month, day=1) 52 | d2 = (datetime(year=year, month=month, day=28) + timedelta(days=4) 53 | ).replace(day=1) - timedelta(minutes=15) 54 | 55 | monthly_data = list(map(lambda url: download_data(url, collector), 56 | get_archive_list(collector, collectors2url, d1, d2))) 57 | 58 | detect(monthly_data, route_change_dir, snapshot_dir) 59 | 60 | if __name__ == "__main__": 61 | detect_monthly_for() 62 | -------------------------------------------------------------------------------- /anomaly_detector/BEAM_diff_evaluator_routeviews.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | #-*- coding: utf-8 -*- 3 | 4 | from functools import lru_cache 5 | from pathlib import Path 6 | import pandas as pd 7 | import click 8 | 9 | from utils import load_emb_distance 10 | 11 | repo_dir = Path(__file__).resolve().parent.parent 12 | model_dir = repo_dir/"BEAM_engine"/"models" 13 | 14 | @click.command() 15 | @click.option("--collector", "-c", type=str, default="wide", help="the name of RouteView collector that the route changes to evaluate are from") 16 | @click.option("--year", "-y", type=int, required=True, help="the year of the route changes monitored, e.g., 2024") 17 | @click.option("--month", "-m", type=int, required=True, help="the month of the route changes monitored, e.g., 8") 18 | @click.option("--beam-model", "-b", type=str, required=True, help="the trained BEAM model to use, e.g., 20240801.as-rel2.1000.10.128") 19 | def evaluate_monthly_for(collector, year, month, beam_model): 20 | collector_result_dir = repo_dir/"routing_monitor"/"detection_result"/collector 21 | route_change_dir = collector_result_dir/"route_change" 22 | beam_metric_dir = collector_result_dir/"BEAM_metric" 23 | beam_metric_dir.mkdir(exist_ok=True, parents=True) 24 | 25 | emb_dir = model_dir/beam_model 26 | emb_d, dtw_d, path_d, emb, _, _ = load_emb_distance(emb_dir, return_emb=True) 27 | 28 | def dtw_d_only_exist(s, t): 29 | return dtw_d([i for i in s if i in emb], [i for i in t if i in emb]) 30 | 31 | for i in route_change_dir.glob(f"{year}{month:02d}*.csv"): 32 | beam_metric_file = 
beam_metric_dir/f"{i.stem}.bm.csv" 33 | if beam_metric_file.exists(): continue 34 | 35 | df = pd.read_csv(i) 36 | 37 | path1 = [s.split(" ") for s in df["path1"].values] 38 | path2 = [t.split(" ") for t in df["path2"].values] 39 | 40 | metrics = pd.DataFrame.from_dict({ 41 | "diff": [dtw_d(s,t) for s,t in zip(path1, path2)], 42 | "diff_only_exist": [dtw_d_only_exist(s,t) for s,t in zip(path1, path2)], 43 | "path_d1": [path_d(i) for i in path1], 44 | "path_d2": [path_d(i) for i in path2], 45 | "path_l1": [len(i) for i in path1], 46 | "path_l2": [len(i) for i in path2], 47 | "head_tail_d1": [emb_d(i[0], i[-1]) for i in path1], 48 | "head_tail_d2": [emb_d(i[0], i[-1]) for i in path2], 49 | }) 50 | 51 | metrics.to_csv(beam_metric_file, index=False) 52 | 53 | if __name__ == "__main__": 54 | evaluate_monthly_for() 55 | -------------------------------------------------------------------------------- /routing_monitor/monitor.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import ipaddress 3 | 4 | class Monitor: 5 | class Node: 6 | def __init__(self): 7 | self.routes = dict() # forwarder -> aspath 8 | self.left = None 9 | self.right = None 10 | 11 | def get_left(self): 12 | if self.left is None: 13 | self.left = Monitor.Node() 14 | return self.left 15 | 16 | def get_right(self): 17 | if self.right is None: 18 | self.right = Monitor.Node() 19 | return self.right 20 | 21 | def find_route(self, forwarder): 22 | if forwarder in self.routes: 23 | return self.routes[forwarder] 24 | return None 25 | 26 | def __init__(self): 27 | self.root = Monitor.Node() 28 | self.route_changes = [] 29 | 30 | def update(self, timestamp, prefix_str, vantage_point, aspath_str, detect): 31 | prefix = ipaddress.ip_network(prefix_str) 32 | 33 | if prefix.version == 6: return 34 | prefixlen = prefix.prefixlen 35 | prefix = int(prefix[0]) >> (32-prefixlen) 36 | 37 | aspath = aspath_str.split(" ") 38 | forwarder = aspath[0] # NOTE: forwarder could be vantage point, or could not 39 | 40 | n = self.root 41 | original_route = None 42 | for shift in range(prefixlen-1, -1, -1): # find the original route 43 | left = (prefix >> shift) & 1 44 | 45 | if left: n = n.get_left() 46 | else: n = n.get_right() 47 | 48 | if n.find_route(forwarder) is not None: 49 | original_route = [shift, n.find_route(forwarder)] 50 | 51 | if detect and original_route is not None: 52 | shift, original_path = original_route 53 | vict_prefix = ipaddress.ip_network(prefix_str) \ 54 | .supernet(new_prefix=prefixlen-shift) 55 | if aspath != original_path: 56 | self.route_changes.append({ 57 | "timestamp" : timestamp, 58 | "vantage_point": vantage_point, 59 | "forwarder" : forwarder, 60 | "prefix1" : str(vict_prefix), 61 | "prefix2" : prefix_str, 62 | "path1" : " ".join(original_path), 63 | "path2" : " ".join(aspath), 64 | }) 65 | 66 | n.routes[forwarder] = aspath 67 | 68 | def consume(self, df, detect=False): 69 | if "A/W" in df.columns: 70 | df = df.loc[df["A/W"] == "A"] # NOTE: fair move 71 | cols = ["timestamp", "prefix", "peer-asn", "as-path"] 72 | 73 | for a in df[cols].values: 74 | self.update(*a, detect=detect) 75 | -------------------------------------------------------------------------------- /post_processor/rpki_validator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | #-*- coding: utf-8 -*- 3 | import requests 4 | import ipaddress 5 | import lzma 6 | from pathlib import Path 7 | import pandas as pd 8 | 9 | script_dir = 
Path(__file__).resolve().parent 10 | cache_dir = script_dir/"rpki_cache" 11 | cache_dir.mkdir(parents=True, exist_ok=True) 12 | 13 | def fetch_and_uncompress_xz(url, output_path): 14 | if output_path.exists(): 15 | return output_path 16 | try: 17 | temp_xz_file = output_path.with_suffix(output_path.suffix + ".xz") 18 | 19 | response = requests.get(url, stream=True) 20 | response.raise_for_status() 21 | 22 | with temp_xz_file.open("wb") as file: 23 | for chunk in response.iter_content(chunk_size=8192): 24 | file.write(chunk) 25 | 26 | with lzma.open(temp_xz_file, "rb") as xz_file: 27 | with output_path.open("wb") as out_file: 28 | out_file.write(xz_file.read()) 29 | 30 | temp_xz_file.unlink() 31 | 32 | return output_path 33 | 34 | except requests.RequestException as e: 35 | print(f"Error fetching file from {url}: {e}") 36 | except lzma.LZMAError as e: 37 | print(f"Error decompressing the .xz file: {e}") 38 | except Exception as e: 39 | print(f"An unexpected error occurred: {e}") 40 | 41 | # check this out if the current one is down: http://josephine.sobornost.net/ 42 | def sync_cache(year, month, day, source="https://ftp.ripe.net/rpki"): 43 | dfs = [] 44 | for rir in ["apnic", "afrinic", "arin", "lacnic", "ripencc"]: 45 | url = f"{source}/{rir}.tal/{year}/{month:02d}/{day:02d}/roas.csv.xz" 46 | output_path = cache_dir/f"roas-{rir}-{year}{month:02d}{day:02d}.csv" 47 | df = pd.read_csv(fetch_and_uncompress_xz(url, output_path)) 48 | df["TA"] = rir 49 | dfs.append(df) 50 | return pd.concat(dfs, ignore_index=True) 51 | 52 | class RPKI: 53 | class PrefixNode: 54 | def __init__(self): 55 | self.left = None 56 | self.right = None 57 | self.data = [] 58 | 59 | def get_left(self): 60 | if self.left is None: 61 | self.left = RPKI.PrefixNode() 62 | return self.left 63 | 64 | def get_right(self): 65 | if self.right is None: 66 | self.right = RPKI.PrefixNode() 67 | return self.right 68 | 69 | def update_data(self, **kwargs): 70 | self.data.append(kwargs) 71 | 72 | def __init__(self): 73 | self.root = RPKI.PrefixNode() 74 | 75 | def load_data(self, year, month, day): 76 | df = sync_cache(year, month, day) 77 | for _, row in df.iterrows(): 78 | if row["IP Prefix"][-2:] == "/0": continue 79 | directions = self.prefix_to_dirs(row["IP Prefix"]) 80 | if not directions: continue 81 | self.create_node(directions).update_data(**row.to_dict()) 82 | return self 83 | 84 | @staticmethod 85 | def prefix_to_dirs(prefix_str): 86 | prefix = ipaddress.ip_network(prefix_str) 87 | if prefix.version == 6: return None 88 | prefixlen = prefix.prefixlen 89 | prefix = int(prefix[0]) >> (32-prefixlen) 90 | directions = [(prefix>>shift)&1 91 | for shift in range(prefixlen-1, -1, -1)] 92 | return directions 93 | 94 | def create_node(self, directions): 95 | n = self.root 96 | for left in directions: 97 | if left: n = n.get_left() 98 | else: n = n.get_right() 99 | return n 100 | 101 | def match_node(self, directions): 102 | matched = [] 103 | n = self.root 104 | for left in directions: 105 | if left: n = n.get_left() 106 | else: n = n.get_right() 107 | if n is None: break 108 | if n.data: matched += n.data 109 | return matched 110 | 111 | def validate(self, prefix_str, asn_str): 112 | directions = self.prefix_to_dirs(prefix_str) 113 | 114 | if not directions: return "Not Found" 115 | 116 | matched = self.match_node(directions) 117 | 118 | if not matched: return "Not Found" 119 | 120 | for roa in matched: 121 | if int(prefix_str.split("/")[-1]) <= int(roa["Max Length"]) \ 122 | and f"AS{asn_str}" == roa["ASN"]: 123 | return "Valid" 
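# Covering ROAs exist at this point, but none authorizes this origin ASN within its
# MaxLength, so the announcement is treated as RPKI-invalid; "Not Found" above is
# returned only when no covering ROA exists at all (cf. RFC 6811 origin validation).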
124 | 125 | return "Invalid" 126 | 127 | def all_matched(self, prefix_str): 128 | directions = self.prefix_to_dirs(prefix_str) 129 | 130 | if not directions: return [] 131 | 132 | matched = self.match_node(directions) 133 | 134 | return matched 135 | -------------------------------------------------------------------------------- /data/bgpstream/fetch_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | #-*- coding: utf-8 -*- 3 | 4 | from pathlib import Path 5 | import numpy as np 6 | from urllib.parse import urljoin 7 | from concurrent.futures import ThreadPoolExecutor 8 | import subprocess 9 | import json 10 | import re 11 | 12 | SCRIPT_DIR = Path(__file__).resolve().parent 13 | CACHE_DIR = SCRIPT_DIR/"cache" 14 | CACHE_DIR.mkdir(parents=True, exist_ok=True) 15 | 16 | url_index="https://bgpstream.crosswork.cisco.com/" 17 | 18 | def get_page(url): 19 | page = subprocess.check_output(["curl", "-s", url]).decode() 20 | return page 21 | 22 | def item_parser(index_page): 23 | events = [] 24 | for item_str in re.finditer(r'\.+?\', index_page, flags=re.DOTALL): 25 | try: 26 | item_str = item_str[0] 27 | item = dict() 28 | for k, v in re.findall(r'\(.+?)\', 29 | item_str, flags=re.DOTALL): 30 | v = re.sub(r"\s\s+", " ", v.replace("\n", " ")).strip() 31 | 32 | if k == "asn": 33 | asns = re.findall(r'\(AS (\d+?)\)', v, flags=re.DOTALL) 34 | 35 | if item["event_type"] == "Outage": 36 | item["asn"] = asns 37 | elif item["event_type"] == "Possible Hijack": 38 | expected, detected = asns 39 | item["expected_asn"] = expected 40 | item["detected_asn"] = detected 41 | elif item["event_type"] == "BGP Leak": 42 | origin, leaker = asns 43 | item["origin_asn"] = origin 44 | item["leaker_asn"] = leaker 45 | else: 46 | raise RuntimeError( 47 | f"Uncovered event_type: {item['event_type']}") 48 | elif k == "country" and v: 49 | item["country"] = v.split(" ")[0] 50 | elif k == "moredetail": 51 | v = re.search(r'\', v) 52 | if v is None: 53 | item["moredetail"] = "" 54 | item["event_id"] = "" 55 | else: 56 | item["moredetail"] = v[1] 57 | item["event_id"] = v[2] 58 | else: 59 | item[k] = v 60 | events.append(item) 61 | except Exception as e: 62 | print(e) 63 | continue 64 | ids = np.array([int(i["event_id"]) for i in events]) 65 | sort_idx = np.argsort(ids) 66 | events = np.array(events)[sort_idx].tolist() 67 | ids = ids[sort_idx].tolist() 68 | return events, ids 69 | 70 | def update_cache(): 71 | index_page = get_page(url_index) 72 | events, ids = item_parser(index_page) 73 | 74 | current_id = [int(i.stem) for i in CACHE_DIR.glob("*.jsonl")] 75 | current_max_id = max(current_id) if current_id else -1 76 | start_idx = np.searchsorted(ids, current_max_id, "right") 77 | 78 | events = events[start_idx:] 79 | if not events: 80 | print("No need to update.") 81 | return 82 | 83 | def fetch_for_detail(ev): 84 | if ev["event_type"] == "Possible Hijack": 85 | detail_page = get_page(urljoin(url_index, ev["moredetail"])) 86 | pattern = r'Expected prefix: (.+?/\d{1,2})' 87 | expected = re.search(pattern, detail_page) 88 | if expected is not None: 89 | ev["expected_prefix"] = expected[1] 90 | else: 91 | print(f"unknown expected_prefix: {ev}") 92 | 93 | pattern = r'Detected advertisement: (.+?/\d{1,2})' 94 | detected = re.search(pattern, detail_page) 95 | if detected is not None: 96 | ev["detected_prefix"] = detected[1] 97 | else: 98 | print(f"unknown detected_prefix: {ev}") 99 | 100 | elif ev["event_type"] == "BGP Leak": 101 | detail_page = 
get_page(urljoin(url_index, ev["moredetail"])) 102 | pattern = r'Leaked prefix: (.+?/\d{1,2})' 103 | leaked = re.search(pattern, detail_page) 104 | if leaked is not None: 105 | ev["leaked_prefix"] = leaked[1] 106 | else: 107 | print(f"unknown leaked_prefix: {ev}") 108 | 109 | pattern = r'Leaked To:\\s+\(\d+)' 110 | leakedto = re.search(pattern, detail_page) 111 | if leakedto is not None: 112 | ev["leaked_to"] = leakedto[1] 113 | else: 114 | print(f"unknown leaked_to: {ev}") 115 | 116 | with ThreadPoolExecutor(max_workers=128) as executor: 117 | executor.map(fetch_for_detail, events) 118 | 119 | f = open(CACHE_DIR/f"{ids[-1]}.jsonl", "w") 120 | f.write("\n".join([json.dumps(ev) for ev in events])+"\n") 121 | f.close() 122 | 123 | print(f"Update {len(events)} items") 124 | print(f"Latest event_id: {ids[-1]}") 125 | 126 | update_cache() 127 | -------------------------------------------------------------------------------- /anomaly_detector/report_anomaly_routeviews.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | #-*- coding: utf-8 -*- 3 | 4 | from pathlib import Path 5 | from joblib import Parallel, delayed 6 | from concurrent.futures import ThreadPoolExecutor 7 | from utils import approx_knee_point, event_aggregate 8 | import json 9 | import pandas as pd 10 | import numpy as np 11 | import click 12 | 13 | repo_dir = Path(__file__).resolve().parent.parent 14 | route_change_dir = None 15 | beam_metric_dir = None 16 | 17 | def load_monthly_data(year, month, preprocessor=lambda df: df): 18 | route_change_files = sorted(route_change_dir.glob(f"{year}{month:02d}*.csv")) 19 | beam_metric_files = sorted(beam_metric_dir.glob(f"{year}{month:02d}*.bm.csv")) 20 | datetimes = [i.stem.replace(".","")[:-2] for i in route_change_files] 21 | 22 | bulk_datetimes, bulk_indices = np.unique(datetimes, return_index=True) 23 | bulk_ranges = zip(bulk_indices, bulk_indices[1:].tolist()+[len(datetimes)]) 24 | 25 | def load_one_bulk(i,j): 26 | rc_df = pd.concat(list(map(pd.read_csv, route_change_files[i:j]))) 27 | bm_df = pd.concat(list(map(pd.read_csv, beam_metric_files[i:j]))) 28 | return pd.concat([rc_df, bm_df], axis=1) 29 | 30 | with ThreadPoolExecutor(max_workers=4) as executor: 31 | bulks = list(executor.map( 32 | lambda x: preprocessor(load_one_bulk(*x)), bulk_ranges)) 33 | 34 | return bulk_datetimes, bulks 35 | 36 | def metric_threshold(df, metric_col): 37 | values = df[metric_col] 38 | mu = np.mean(values) 39 | sigma = np.std(values) 40 | metric_th = mu+4*sigma 41 | 42 | print("reference metric: ") 43 | print(values.describe()) 44 | print(f"metric threshold: {metric_th}") 45 | 46 | return metric_th 47 | 48 | def forwarder_threshold(df, event_key): 49 | route_changes = tuple(df.groupby(event_key)) 50 | forwarder_num = [len(j["forwarder"].unique()) for _, j in route_changes] 51 | forwarder_th, cdf = approx_knee_point(forwarder_num) 52 | 53 | print("reference forwarder: ") 54 | print(pd.Series(forwarder_num).describe()) 55 | print(f"forwarder threshold: {forwarder_th}") 56 | 57 | return forwarder_th 58 | 59 | def window(df0, df1, # df0 for reference, df1 for detection 60 | metric="diff", event_key=["prefix1", "prefix2"], 61 | dedup_index=["prefix1", "prefix2", "forwarder", "path1", "path2"]): 62 | 63 | if dedup_index is not None: 64 | df0 = df0.drop_duplicates(dedup_index, keep="first", inplace=False, ignore_index=True) 65 | 66 | with pd.option_context("mode.use_inf_as_na", True): 67 | df0 = df0.dropna(how="any") 68 | 69 | metric_th = 
metric_threshold(df0, metric) 70 | forwarder_th = forwarder_threshold(df0, event_key) 71 | 72 | events = {} 73 | for key,ev in tuple(df1.groupby(event_key)): 74 | if len(ev["forwarder"].unique()) <= forwarder_th: continue 75 | 76 | ev_sig = ev.sort_values(metric, ascending=False).drop_duplicates("forwarder") 77 | ev_anomaly = ev_sig.loc[ev_sig[metric]>metric_th] 78 | if ev_anomaly.shape[0] <= forwarder_th: continue 79 | 80 | events[key] = ev_anomaly 81 | 82 | if events: 83 | _, df = event_aggregate(events) 84 | n_alarms = len(df['group_id'].unique()) 85 | else: 86 | df = None 87 | n_alarms = 0 88 | 89 | info = dict( 90 | metric=metric, 91 | event_key=event_key, 92 | metric_th=float(metric_th), 93 | forwarder_th=int(forwarder_th), 94 | n_raw_events=len(events), 95 | n_alarms=n_alarms, 96 | ) 97 | 98 | return info, df 99 | 100 | @click.command() 101 | @click.option("--collector", "-c", type=str, default="wide", help="the name of RouteView collector to detect anomalies") 102 | @click.option("--year", "-y", type=int, required=True, help="the year of the route changes monitored, e.g., 2024") 103 | @click.option("--month", "-m", type=int, required=True, help="the month of the route changes monitored, e.g., 8") 104 | def report_alarm_monthly(collector, year, month): 105 | global route_change_dir, beam_metric_dir 106 | collector_result_dir = repo_dir/"routing_monitor"/"detection_result"/collector 107 | route_change_dir = collector_result_dir/"route_change" 108 | beam_metric_dir = collector_result_dir/"BEAM_metric" 109 | reported_alarm_dir = collector_result_dir/"reported_alarms"/f"{year}{month:02d}" 110 | reported_alarm_dir.mkdir(parents=True, exist_ok=True) 111 | 112 | def preprocessor(df): 113 | df["diff_balance"] = df["diff"]/(df["path_d1"]+df["path_d2"]) 114 | return df 115 | 116 | datetimes, bulks = load_monthly_data(year, month, preprocessor) 117 | indices = np.arange(len(bulks)) 118 | infos = [] 119 | 120 | for i, j in list(zip(indices[:-1], indices[1:])): 121 | info = dict(d0=datetimes[i], d1=datetimes[j]) 122 | _info, df = window(bulks[i], bulks[j], metric="diff_balance") 123 | info.update(**_info) 124 | 125 | if df is None: 126 | info.update(save_path=None) 127 | else: 128 | save_path = reported_alarm_dir/f"{datetimes[i]}_{datetimes[j]}.alarms.csv" 129 | df.to_csv(save_path, index=False) 130 | info.update(save_path=str(save_path)) 131 | 132 | infos.append(info) 133 | 134 | json.dump(infos, open(reported_alarm_dir/f"info_{year}{month:02d}.json", "w"), indent=2) 135 | 136 | if __name__ == "__main__": 137 | report_alarm_monthly() 138 | -------------------------------------------------------------------------------- /post_processor/irr_validator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | #-*- coding: utf-8 -*- 3 | 4 | import ftplib 5 | import shutil 6 | from pathlib import Path 7 | import gzip 8 | import ipaddress 9 | import re 10 | 11 | script_dir = Path(__file__).resolve().parent 12 | cache_dir = script_dir/"irr_cache" 13 | cache_dir.mkdir(parents=True, exist_ok=True) 14 | 15 | def fetch_and_uncompress_gz(year, month, day, domain="ftp.radb.net"): 16 | output_path = cache_dir/f"radb-{year}{month:02d}{day:02d}.db" 17 | 18 | if output_path.exists(): 19 | return output_path 20 | 21 | temp_gz_file = output_path.with_suffix(output_path.suffix + ".gz") 22 | 23 | ftp = ftplib.FTP(domain) 24 | ftp.login() 25 | 26 | remote_paths = [ 27 | f"/radb/dbase/archive/{year}/radb.db.{year}{month:02d}{day:02d}.gz", 28 | 
f"/radb/dbase/archive/{year}/radb.db.{str(year)[-2:]}{month:02d}{day:02d}.gz" 29 | ] 30 | 31 | for remote in remote_paths: 32 | try: 33 | with temp_gz_file.open("wb") as file: 34 | ftp.retrbinary(f"RETR {remote}", file.write) 35 | break 36 | except ftplib.error_perm as e: 37 | print(f"Remote file not found at {remote}. Trying the next path...") 38 | except Exception as e: 39 | print(f"Unexpected error while accessing {remote}: {e}") 40 | else: 41 | raise FileNotFoundError("Remote files unavailable.") 42 | 43 | with gzip.open(temp_gz_file, "rb") as f_in, output_path.open("wb") as f_out: 44 | shutil.copyfileobj(f_in, f_out) 45 | 46 | temp_gz_file.unlink() 47 | 48 | return output_path 49 | 50 | def parse_route_blocks(file_path): 51 | with open(file_path, encoding='ISO-8859-1') as file: 52 | content = file.read() 53 | 54 | blocks = content.strip().split('\n\n') 55 | 56 | route_blocks = [] 57 | for block in blocks: 58 | if block.startswith("route:"): 59 | route_dict = parse_route_block(block) 60 | assert route_dict 61 | route_dict["original_data"] = block 62 | route_blocks.append(route_dict) 63 | 64 | return route_blocks 65 | 66 | def parse_route_block(block): 67 | block_dict = {} 68 | current_key = None 69 | 70 | for line in block.split('\n'): 71 | if not line: continue 72 | match = re.match(r'^(\S+):(.*)$', line) 73 | if match: 74 | current_key, value = match.groups() 75 | else: 76 | value = line # multi-line value 77 | assert current_key is not None 78 | block_dict.setdefault(current_key, []).append(value.strip()) 79 | for k, v in block_dict.items(): 80 | block_dict[k] = "\n".join(v) 81 | 82 | return block_dict 83 | 84 | def sync_cache(year, month, day): 85 | route_objects = parse_route_blocks(fetch_and_uncompress_gz(year, month, day)) 86 | return route_objects 87 | 88 | class RADB: 89 | class PrefixNode: 90 | def __init__(self): 91 | self.left = None 92 | self.right = None 93 | self.data = [] 94 | 95 | def get_left(self): 96 | if self.left is None: 97 | self.left = RADB.PrefixNode() 98 | return self.left 99 | 100 | def get_right(self): 101 | if self.right is None: 102 | self.right = RADB.PrefixNode() 103 | return self.right 104 | 105 | def update_data(self, **kwargs): 106 | self.data.append(kwargs) 107 | 108 | def __init__(self): 109 | self.root = RADB.PrefixNode() 110 | 111 | def load_data(self, year, month, day): 112 | route_objects = sync_cache(year, month, day) 113 | for obj in route_objects: 114 | if obj["route"][-2:] == "/0": continue 115 | try: 116 | directions = self.prefix_to_dirs(obj["route"]) 117 | except: 118 | print(obj) 119 | exit() 120 | if not directions: continue 121 | self.create_node(directions).update_data(**obj) 122 | return self 123 | 124 | @staticmethod 125 | def prefix_to_dirs(prefix_str): 126 | prefix = ipaddress.ip_network(prefix_str) 127 | if prefix.version == 6: return None 128 | prefixlen = prefix.prefixlen 129 | prefix = int(prefix[0]) >> (32-prefixlen) 130 | directions = [(prefix>>shift)&1 131 | for shift in range(prefixlen-1, -1, -1)] 132 | return directions 133 | 134 | def create_node(self, directions): 135 | n = self.root 136 | for left in directions: 137 | if left: n = n.get_left() 138 | else: n = n.get_right() 139 | return n 140 | 141 | def match_node(self, directions): 142 | matched = None 143 | n = self.root 144 | for left in directions: 145 | if left: n = n.get_left() 146 | else: n = n.get_right() 147 | if n is None: break 148 | if n.data: matched = n 149 | return matched 150 | 151 | def validate(self, prefix_str, asn_str): 152 | directions = 
self.prefix_to_dirs(prefix_str) 153 | 154 | if not directions: return "Not Found" 155 | 156 | matched = self.match_node(directions) # longest match 157 | 158 | if matched is None: return "Not Found" 159 | 160 | irrs = matched.data 161 | 162 | for irr in irrs: 163 | if f"AS{asn_str}" == irr["origin"]: 164 | return "Valid" 165 | 166 | return "Invalid" 167 | 168 | def all_matched(self, prefix_str): 169 | directions = self.prefix_to_dirs(prefix_str) 170 | 171 | if not directions: return [] 172 | 173 | matched = self.match_node(directions) # longest match 174 | 175 | if matched is None: return [] 176 | 177 | return matched.data 178 | -------------------------------------------------------------------------------- /data/routeviews/fetch_rib.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | #-*- coding: utf-8 -*- 3 | 4 | from pathlib import Path 5 | from io import StringIO 6 | from urllib.parse import urljoin 7 | import pandas as pd 8 | import numpy as np 9 | import subprocess 10 | import re 11 | import json 12 | from datetime import datetime 13 | from dateutil.relativedelta import relativedelta 14 | 15 | SCRIPT_DIR = Path(__file__).resolve().parent 16 | CACHE_DIR = SCRIPT_DIR/"cache" 17 | CACHE_DIR.mkdir(parents=True, exist_ok=True) 18 | 19 | current_ym = datetime.now().strftime("%Y.%m") 20 | for cache_file in CACHE_DIR.glob(f"*{current_ym}*"): # remove incomplete cache files 21 | cache_file.unlink() 22 | 23 | def get_all_collectors(url_index="http://routeviews.org/"): 24 | cache_path = CACHE_DIR/f"collectors2url.{url_index.replace('/', '+')}" 25 | if cache_path.exists(): 26 | # print(f"load cache: {cache_path}") 27 | try: return json.load(open(cache_path, "r")) 28 | except: pass 29 | 30 | res = subprocess.check_output(["curl", "-s", url_index]).decode() 31 | res = re.sub(r"\s\s+", " ", res.replace("\n", " ")) 32 | collectors2url = {} 33 | for a, b in re.findall(r'\.+?\([\w\s]+, from (.+?)\)', res): 34 | collector_name = b.split(".")[-3] 35 | if collector_name in collectors2url: 36 | idx = 2 37 | while f"{collector_name}{idx}" in collectors2url: 38 | idx += 1 39 | collector_name = f"{collector_name}{idx}" 40 | collectors2url[collector_name] = urljoin(url_index, a) + "/" 41 | 42 | # print(f"save cache: {cache_path}") 43 | json.dump(collectors2url, open(cache_path, "w"), indent=2) 44 | return collectors2url 45 | 46 | def get_most_recent_rib(collector, collectors2url, dtime): 47 | if collector not in collectors2url: return [] 48 | 49 | def pull_list(): 50 | target_url = urljoin(collectors2url[collector], f"{ym}{subdir}") + "/" 51 | cache_path = CACHE_DIR/f"archive_list.{target_url.replace('/', '+')}" 52 | if cache_path.exists(): 53 | # print(f"load cache: {cache_path}") 54 | try: return target_url, json.load(open(cache_path, "r")) 55 | except: pass 56 | res = subprocess.check_output(["curl", "-s", target_url]).decode() 57 | archive_list = re.findall( 58 | r'\', res) 59 | # print(f"save cache: {cache_path}") 60 | json.dump(archive_list, open(cache_path, "w"), indent=2) 61 | return target_url, archive_list 62 | 63 | ym = dtime.strftime("%Y.%m") 64 | subdir = "/RIBS" 65 | target_url, archive_list = pull_list() 66 | 67 | if not archive_list: 68 | subdir = "" 69 | target_url, archive_list = pull_list() 70 | if not archive_list: return [] 71 | 72 | time_list = ["".join(i[1:]) for i in archive_list] 73 | t = dtime.strftime("%Y%m%d%H%M") 74 | idx = np.searchsorted(time_list, t) 75 | 76 | if idx == 0: 77 | data1 = urljoin(target_url, 
archive_list[0][0]) 78 | dtime = dtime-relativedelta(months=1) 79 | ym = dtime.strftime("%Y.%m") 80 | target_url, archive_list = pull_list() 81 | if not archive_list: return [] 82 | data0 = urljoin(target_url, archive_list[-1][0]) 83 | stime = datetime.strptime("".join(archive_list[-1][1:]), "%Y%m%d%H%M") 84 | return data0, data1, stime 85 | 86 | if idx == len(time_list): 87 | data0 = urljoin(target_url, archive_list[-1][0]) 88 | stime = datetime.strptime("".join(archive_list[-1][1:]), "%Y%m%d%H%M") 89 | dtime = dtime+relativedelta(months=1) 90 | ym = dtime.strftime("%Y.%m") 91 | target_url, archive_list = pull_list() 92 | if not archive_list: return [] 93 | data1 = urljoin(target_url, archive_list[0][0]) 94 | return data0, data1, stime 95 | 96 | data0 = urljoin(target_url, archive_list[idx-1][0]) 97 | data1 = urljoin(target_url, archive_list[idx][0]) 98 | stime = datetime.strptime("".join(archive_list[idx-1][1:]), "%Y%m%d%H%M") 99 | return data0, data1, stime 100 | 101 | def download_data(url, collector): 102 | fname = url.split("/")[-1].strip() 103 | outpath = SCRIPT_DIR / "ribs" / collector / fname 104 | fpath = outpath.with_suffix("") 105 | if fpath.exists(): 106 | # print(f"updates for {collector} {outpath.stem} already existed") 107 | return fpath 108 | outpath.parent.mkdir(exist_ok=True, parents=True) 109 | subprocess.run(["curl", "-s", url, "--output", str(outpath)], check=True) 110 | subprocess.run(["bzip2", "-d", str(outpath)], check=True) 111 | print(f"get ribs for {collector} {outpath.stem}") 112 | return fpath 113 | 114 | def load_ribs_to_df(fpath): 115 | if fpath.suffix == ".dat": 116 | fd = open(fpath, "r") 117 | l = fd.readline() 118 | while l: 119 | if "Network" in l and "Path" in l: 120 | idx_network = l.find("Network") 121 | idx_path = l.find("Path") 122 | break 123 | l = fd.readline() 124 | 125 | data = [] 126 | current_network = "" 127 | while l: 128 | if l[0] != "*": l = fd.readline(); continue 129 | if l[idx_network] != " ": 130 | current_network = l[idx_network:idx_network+l[idx_network:].find(" ")] 131 | if l[1] == ">": 132 | path = l[idx_path:-3] 133 | if "/" in current_network: 134 | data.append(["0", current_network, path.split(" ")[0], path]) 135 | l = fd.readline() 136 | df = pd.DataFrame(data, columns=["timestamp", "prefix", "peer-asn", "as-path"]) 137 | else: 138 | bgpd = SCRIPT_DIR / 'bgpd' 139 | res = subprocess.check_output([str(bgpd), "-q", "-m", "-u", str(fpath)]).decode() 140 | fmt = "type|timestamp|A/W|peer-ip|peer-asn|prefix|as-path|origin-protocol|next-hop|local-pref|MED|community|atomic-agg|aggregator|unknown-field-1|unknown-field-2" 141 | cols = fmt.split("|") 142 | df = pd.read_csv(StringIO(res), sep="|", names=cols, usecols=cols[:-2], dtype=str, keep_default_na=False) 143 | 144 | return df 145 | -------------------------------------------------------------------------------- /data/routeviews/fetch_updates.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | #-*- coding: utf-8 -*- 3 | 4 | from pathlib import Path 5 | from io import StringIO 6 | from urllib.parse import urljoin 7 | from datetime import datetime 8 | from dateutil.relativedelta import relativedelta 9 | from concurrent.futures import ThreadPoolExecutor 10 | import pandas as pd 11 | import numpy as np 12 | import subprocess 13 | import re 14 | import json 15 | import click 16 | 17 | SCRIPT_DIR = Path(__file__).resolve().parent 18 | CACHE_DIR = SCRIPT_DIR/"cache" 19 | CACHE_DIR.mkdir(parents=True, exist_ok=True) 20 | 21 | 
current_ym = datetime.now().strftime("%Y.%m") 22 | for cache_file in CACHE_DIR.glob(f"*{current_ym}*"): # remove incomplete cache files 23 | cache_file.unlink() 24 | 25 | def get_all_collectors(url_index="http://routeviews.org/"): 26 | cache_path = CACHE_DIR/f"collectors2url.{url_index.replace('/', '+')}" 27 | if cache_path.exists(): 28 | # print(f"load cache: {cache_path}") 29 | try: return json.load(open(cache_path, "r")) 30 | except: pass 31 | 32 | res = subprocess.check_output(["curl", "-s", url_index]).decode() 33 | res = re.sub(r"\s\s+", " ", res.replace("\n", " ")) 34 | collectors2url = {} 35 | for a, b in re.findall(r'\.+?\([\w\s]+, from (.+?)\)', res): 36 | collector_name = b.split(".")[-3] 37 | if collector_name in collectors2url: 38 | idx = 2 39 | while f"{collector_name}{idx}" in collectors2url: 40 | idx += 1 41 | collector_name = f"{collector_name}{idx}" 42 | collectors2url[collector_name] = urljoin(url_index, a) + "/" 43 | 44 | # print(f"save cache: {cache_path}") 45 | json.dump(collectors2url, open(cache_path, "w"), indent=2) 46 | return collectors2url 47 | 48 | def get_archive_list(collector, collectors2url, dtime1, dtime2): 49 | if collector not in collectors2url: return [] 50 | 51 | def pull_list(ym): 52 | target_url = urljoin(collectors2url[collector], f"{ym}/UPDATES") + "/" 53 | cache_path = CACHE_DIR/f"archive_list.{target_url.replace('/', '+')}" 54 | if cache_path.exists(): 55 | # print(f"load cache: {cache_path}") 56 | try: return target_url, json.load(open(cache_path, "r")) 57 | except: pass 58 | res = subprocess.check_output(["curl", "-s", target_url]).decode() 59 | archive_list = re.findall( 60 | r'\', res) 61 | # print(f"save cache: {cache_path}") 62 | json.dump(archive_list, open(cache_path, "w"), indent=2) 63 | return target_url, archive_list 64 | 65 | ym1 = dtime1.strftime("%Y.%m") 66 | ym2 = dtime2.strftime("%Y.%m") 67 | target_url1, archive_list1 = pull_list(ym1) 68 | target_url2, archive_list2 = pull_list(ym2) 69 | 70 | if not archive_list1 or not archive_list2: 71 | print(f"failed to get archive list: {dtime1} {dtime2}") 72 | exit(1) 73 | 74 | time_list1 = ["".join(i[1:]) for i in archive_list1] 75 | time_list2 = ["".join(i[1:]) for i in archive_list2] 76 | t1 = dtime1.strftime("%Y%m%d%H%M") 77 | t2 = dtime2.strftime("%Y%m%d%H%M") 78 | idx1 = np.searchsorted(time_list1, t1, side="left") 79 | idx2 = np.searchsorted(time_list2, t2, side="right") 80 | 81 | if time_list1 == time_list2: 82 | data = [urljoin(target_url1, i[0]) for i in archive_list1[idx1:idx2]] 83 | else: 84 | data = [urljoin(target_url1, i[0]) for i in archive_list1[idx1:]] 85 | 86 | current_month = datetime(dtime1.year, dtime1.month, 1) 87 | current_month += relativedelta(months=1) 88 | upper_bound = datetime(dtime2.year, dtime2.month, 1) 89 | while current_month < upper_bound: 90 | cur_ym = current_month.strftime("%Y.%m") 91 | cur_target_url, cur_archive_list = pull_list(cur_ym) 92 | data += [urljoin(cur_target_url, i[0]) for i in cur_archive_list] 93 | current_month += relativedelta(months=1) 94 | data += [urljoin(target_url2, i[0]) for i in archive_list2[:idx2]] 95 | 96 | return data 97 | 98 | def download_data(url, collector): 99 | fname = url.split("/")[-1].strip() 100 | outpath = SCRIPT_DIR / "updates" / collector / fname 101 | fpath = outpath.with_suffix("") 102 | if fpath.exists(): 103 | # print(f"updates for {collector} {outpath.stem} already existed") 104 | return fpath 105 | outpath.parent.mkdir(exist_ok=True, parents=True) 106 | subprocess.run(["curl", "-s", url, "--output", 
str(outpath)], check=True) 107 | subprocess.run(["bzip2", "-d", str(outpath)], check=True) 108 | print(f"get updates for {collector} {outpath.stem}") 109 | return fpath 110 | 111 | def load_updates_to_df(fpath, bgpd=SCRIPT_DIR/"bgpd"): 112 | res = subprocess.check_output([str(bgpd), "-q", "-m", "-u", str(fpath)]).decode() 113 | fmt = "type|timestamp|A/W|peer-ip|peer-asn|prefix|as-path|origin-protocol|next-hop|local-pref|MED|community|atomic-agg|aggregator|unknown-field-1|unknown-field-2" 114 | cols = fmt.split("|") 115 | df = pd.read_csv(StringIO(res), sep="|", names=cols, usecols=cols[:-2], dtype=str, keep_default_na=False) 116 | return df 117 | 118 | 119 | @click.command() 120 | @click.option("--collector", type=str, required=True, help="the collector name, e.g., route-views4") 121 | @click.option("--dtime1", type=str, required=True, help="the starttime (included), e.g., 201812312330") 122 | @click.option("--dtime2", type=str, required=True, help="the endtime (included), e.g., 201812312330") 123 | @click.option("--download", type=bool, default=False, help="download the archives") 124 | @click.option("--num-workers", type=int, default=1, help="number of workers") 125 | def main(collector, dtime1, dtime2, download, num_workers): 126 | dtime1 = datetime.strptime(dtime1, "%Y%m%d%H%M") 127 | dtime2 = datetime.strptime(dtime2, "%Y%m%d%H%M") 128 | 129 | collectors2url = get_all_collectors() 130 | data = get_archive_list(collector, collectors2url, dtime1, dtime2) 131 | # print(data) 132 | 133 | if download: 134 | job = lambda url: download_data(url, collector) 135 | with ThreadPoolExecutor(max_workers=num_workers) as executor: 136 | executor.map(job, data) 137 | 138 | # for url in data: 139 | # fpath = download_data(url, collector) 140 | # print(fpath) 141 | # df = load_updates_to_df(fpath) 142 | # print(df) 143 | 144 | if __name__ == "__main__": 145 | main() 146 | -------------------------------------------------------------------------------- /data/bgpstream/locate_route_change.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | #-*- coding: utf-8 -*- 3 | 4 | from pathlib import Path 5 | from datetime import datetime, timedelta 6 | import json 7 | import numpy as np 8 | import pandas as pd 9 | import ipaddress 10 | from joblib import Parallel, delayed 11 | 12 | import sys 13 | sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) 14 | 15 | from data.routeviews.fetch_updates import download_data, load_updates_to_df, get_all_collectors, get_archive_list as get_updates_list 16 | from data.routeviews.fetch_rib import load_ribs_to_df, get_most_recent_rib 17 | from routing_monitor.monitor import Monitor 18 | 19 | SCRIPT_DIR = Path(__file__).resolve().parent 20 | CACHE_DIR = SCRIPT_DIR/"cache" 21 | EVENT_DIR = SCRIPT_DIR/"event" 22 | EVENT_DIR.mkdir(parents=True, exist_ok=True) 23 | 24 | collectors2url = get_all_collectors() 25 | 26 | class RibMonitor(Monitor): 27 | def __init__(self, rib_df, checker): 28 | super().__init__() 29 | self.checker = checker 30 | self.consume(rib_df, detect=False) 31 | 32 | def update(self, timestamp, prefix_str, vantage_point, aspath_str, detect): 33 | prefix = ipaddress.ip_network(prefix_str) 34 | 35 | if prefix.version == 6: 36 | prefixlen = prefix.prefixlen 37 | prefix = int(prefix[0]) >> (128-prefixlen) 38 | else: 39 | prefixlen = prefix.prefixlen 40 | prefix = int(prefix[0]) >> (32-prefixlen) 41 | 42 | aspath = aspath_str.split(" ") 43 | forwarder = aspath[0] # forwarder could be vantage 
point or not 44 | 45 | n = self.root 46 | original_route = None 47 | for shift in range(prefixlen-1, -1, -1): # find the original route 48 | left = (prefix >> shift) & 1 49 | 50 | if left: n = n.get_left() 51 | else: n = n.get_right() 52 | 53 | if n.find_route(forwarder) is not None: 54 | original_route = [shift, n.find_route(forwarder)] 55 | 56 | if detect and original_route is not None: 57 | shift, original_path = original_route 58 | vict_prefix = ipaddress.ip_network(prefix_str) \ 59 | .supernet(new_prefix=prefixlen-shift) 60 | if aspath != original_path: 61 | route_change = { 62 | "timestamp" : timestamp, 63 | "vantage_point": vantage_point, 64 | "forwarder" : forwarder, 65 | "prefix1" : str(vict_prefix), 66 | "prefix2" : prefix_str, 67 | "path1" : " ".join(original_path), 68 | "path2" : " ".join(aspath), 69 | } 70 | if self.checker(route_change): 71 | self.route_changes.append(route_change) 72 | 73 | n.routes[forwarder] = aspath 74 | 75 | def get_event_list(): 76 | existing_ids = [int(i.stem) for i in EVENT_DIR.glob("*.csv")] 77 | last_id = max(existing_ids) if existing_ids else -1 78 | 79 | events = [json.loads(l.strip()) for jl in CACHE_DIR.glob("*.jsonl") 80 | for l in open(jl, "r").readlines()] 81 | events = [ev for ev in events if ev["event_type"] != "Outage"] # NOTE: ignore outage for now 82 | events = np.array(events) 83 | ids = np.array([int(i["event_id"]) for i in events]) 84 | sort_idx = np.argsort(ids) 85 | events = events[sort_idx].tolist() 86 | ids = ids[sort_idx].tolist() 87 | 88 | start_idx = np.searchsorted(ids, last_id, "right") 89 | return events[start_idx:] 90 | 91 | def process_event(collector, event): 92 | if event["event_type"] == "Outage" and len(event["asn"]) != 1: 93 | return # ignore the country-wide outage 94 | 95 | output_dir = EVENT_DIR/event["event_id"] 96 | if (output_dir/f"{collector}.csv").exists(): 97 | return 98 | 99 | dtime1 = datetime.strptime(event["starttime"], "%Y-%m-%d %H:%M:%S") 100 | dtime2 = datetime.strptime(event["endtime"], "%Y-%m-%d %H:%M:%S") \ 101 | if event["endtime"] else dtime1+timedelta(hours=1) 102 | 103 | rib, _, stime = get_most_recent_rib(collector, collectors2url, dtime1) 104 | update_list = get_updates_list(collector, collectors2url, stime, dtime2) 105 | 106 | rib_fpath = download_data(rib, collector) 107 | update_fpaths = [download_data(url, collector) for url in update_list] 108 | 109 | def outage_process(event): 110 | target_asn, = event["asn"] # assert single asn here 111 | def checker(route_change): 112 | p1 = route_change["path1"].split(" ") 113 | p2 = route_change["path2"].split(" ") 114 | return ((target_asn in p1) and (target_asn not in p2)) \ 115 | or ((target_asn not in p1) and (target_asn in p2)) 116 | def locator(df): 117 | return df 118 | return checker, locator 119 | 120 | def hijack_process(event): 121 | expected_asn = event["expected_asn"] 122 | detected_asn = event["detected_asn"] 123 | def checker(route_change): 124 | o1 = route_change["path1"].split(" ")[-1] 125 | o2 = route_change["path2"].split(" ")[-1] 126 | return (expected_asn == o1 and detected_asn == o2) \ 127 | or (detected_asn == o1 and expected_asn == o2) 128 | expected_prefix = event["expected_prefix"] 129 | detected_prefix = event["detected_prefix"] 130 | def locator(df): 131 | return df.loc[(df["prefix"] == expected_prefix) 132 | | (df["prefix"] == detected_prefix)] 133 | return checker, locator 134 | 135 | def leak_process(event): 136 | origin = event["origin_asn"] 137 | leaker = event["leaker_asn"] 138 | leakedto = event["leaked_to"] 139 | 
def checker(route_change): 140 | p1 = route_change["path1"].split(" ") 141 | p2 = route_change["path2"].split(" ") 142 | return origin == p1[-1] and origin == p2[-1] \ 143 | and ((leaker in p1 and leakedto in p1) 144 | or (leaker in p2 and leakedto in p2)) 145 | leaked_prefix = event["leaked_prefix"] 146 | def locator(df): 147 | return df.loc[df["prefix"] == leaked_prefix] 148 | return checker, locator 149 | 150 | if event["event_type"] == "Outage": 151 | checker, locator = outage_process(event) 152 | elif event["event_type"] == "Possible Hijack": 153 | checker, locator = hijack_process(event) 154 | elif event["event_type"] == "BGP Leak": 155 | checker, locator = leak_process(event) 156 | 157 | rib_df = locator(load_ribs_to_df(rib_fpath)) 158 | 159 | mon = RibMonitor(rib_df, checker) 160 | for fp in update_fpaths: 161 | df = locator(load_updates_to_df(fp)) 162 | mon.consume(df, detect=True) 163 | 164 | route_change_df = pd.DataFrame.from_records(mon.route_changes) 165 | 166 | output_dir.mkdir(exist_ok=True, parents=True) 167 | route_change_df.to_csv(output_dir/f"{collector}.csv", index=False) 168 | 169 | events = get_event_list() 170 | np.random.shuffle(events) 171 | 172 | def process_event_safe(collector, event): 173 | try: process_event(collector, event) 174 | except Exception as e: 175 | print(f"{e} at {collector} {event}") 176 | 177 | Parallel(n_jobs=12, backend="multiprocessing", verbose=10)( 178 | delayed(process_event_safe)("route-views4", ev) for ev in events) 179 | 180 | # for ev in events: 181 | # process_event("route-views4", ev) 182 | # print("done") 183 | # input() 184 | -------------------------------------------------------------------------------- /anomaly_detector/utils.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import json 4 | from pathlib import Path 5 | from functools import lru_cache 6 | from scipy.special import softmax 7 | from itertools import chain 8 | from ipaddress import IPv4Network 9 | import pickle 10 | 11 | def read_csv_empty(*args, **kwargs): 12 | try: return pd.read_csv(*args, **kwargs) 13 | except pd.errors.EmptyDataError: return pd.DataFrame() 14 | 15 | def approx_knee_point(x): 16 | x, y = np.unique(x, return_counts=True) 17 | _x = (x-x.min())/(x.max()-x.min()) 18 | _y = y.cumsum()/y.sum() 19 | idx = np.argmax(np.abs(_y-_x)) 20 | return x[idx], _y[idx] 21 | 22 | def load_emb_distance(train_dir, return_emb=False): 23 | train_dir = Path(train_dir) 24 | 25 | node_emb_path = train_dir / "node.emb" 26 | link_emb_path = train_dir / "link.emb" 27 | rela_emb_path = train_dir / "rela.emb" 28 | 29 | node_emb = pickle.load(open(node_emb_path, "rb")) 30 | link_emb = pickle.load(open(link_emb_path, "rb")) 31 | rela_emb = pickle.load(open(rela_emb_path, "rb")) 32 | rela = rela_emb["p2c"] 33 | link = link_emb["p2c"] 34 | link = softmax(link) 35 | 36 | @lru_cache(maxsize=100000) 37 | def _emb_distance(a, b): # could be cluster-like, e.g. '{123,456}' 38 | a = a.strip("{}").split(",")[0] 39 | b = b.strip("{}").split(",")[0] 40 | if a == b: return 0. 
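# ASes missing from the trained embeddings are treated as infinitely far apart;
# otherwise the distance combines a link-weighted squared difference of the node
# embeddings with the magnitude of their projection onto the p2c relation vector.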
41 | if a not in node_emb or b not in node_emb: 42 | return np.inf 43 | xi = node_emb[a] 44 | xj = node_emb[b] 45 | return np.sum((xj-xi)**2*link) + np.abs(np.sum((xj-xi)*rela)) 46 | 47 | def emb_distance(a, b): 48 | return _emb_distance(str(a), str(b)) 49 | 50 | @lru_cache(maxsize=100000) 51 | def _dtw_distance(s, t): 52 | s = [v for i,v in enumerate(s) if i == 0 or v != s[i-1]] 53 | t = [v for i,v in enumerate(t) if i == 0 or v != t[i-1]] 54 | ls, lt = len(s), len(t) 55 | DTW = np.full((ls+1, lt+1), np.inf) 56 | DTW[0,0] = 0. 57 | for i in range(ls): 58 | for j in range(lt): 59 | cost = emb_distance(s[i], t[j]) 60 | DTW[i+1, j+1] = cost + min(DTW[i , j+1], 61 | DTW[i+1, j ], 62 | DTW[i , j ]) 63 | return DTW[ls, lt] 64 | 65 | def dtw_distance(s, t): 66 | return _dtw_distance(tuple(s), tuple(t)) 67 | 68 | @lru_cache(maxsize=100000) 69 | def _path_emb_length(s): 70 | d = np.array([emb_distance(a,b) for a,b in zip(s[:-1], s[1:])]) 71 | d = d[(d > 0) & (d < np.inf)] 72 | return np.nan if d.size == 0 else d.sum() 73 | 74 | def path_emb_length(s): 75 | return _path_emb_length(tuple(s)) 76 | 77 | if return_emb: 78 | return emb_distance, dtw_distance, path_emb_length, node_emb, link, rela 79 | 80 | return emb_distance, dtw_distance, path_emb_length 81 | 82 | def root_cause_localize_2set(df, th=0.95): 83 | set1_asn_cnt, set2_asn_cnt = {}, {} 84 | for i,j in df[["path1", "path2"]].values: 85 | set_i = set(i.split(" ")) 86 | set_j = set(j.split(" ")) 87 | set_ij = set_i - set_j 88 | set_ji = set_j - set_i 89 | for asn in set_ij: 90 | if asn not in set1_asn_cnt: set1_asn_cnt[asn] = 1 91 | else: set1_asn_cnt[asn] += 1 92 | for asn in set_ji: 93 | if asn not in set2_asn_cnt: set2_asn_cnt[asn] = 1 94 | else: set2_asn_cnt[asn] += 1 95 | 96 | set1, cnt1 = list(set1_asn_cnt.keys()), list(set1_asn_cnt.values()) 97 | idx1 = np.argsort(cnt1)[::-1] 98 | set1 = np.array(set1)[idx1] 99 | cnt1 = np.array(cnt1)[idx1] 100 | 101 | set2, cnt2 = list(set2_asn_cnt.keys()), list(set2_asn_cnt.values()) 102 | idx2 = np.argsort(cnt2)[::-1] 103 | set2 = np.array(set2)[idx2] 104 | cnt2 = np.array(cnt2)[idx2] 105 | 106 | rc_1, rc_2 = [], [] 107 | for a,b in zip(set1, cnt1): 108 | if b/df.shape[0] > th: rc_1.append(a) 109 | for a,b in zip(set2, cnt2): 110 | if b/df.shape[0] > th: rc_2.append(a) 111 | 112 | return sorted(rc_1), sorted(rc_2) 113 | 114 | def root_cause_localize_1set(df, th=0.95): 115 | set_asn_cnt = {} 116 | for i,j in df[["path1", "path2"]].values: 117 | set_i = set(i.split(" ")) 118 | set_j = set(j.split(" ")) 119 | set_xor = set_i^set_j 120 | for asn in set_xor: 121 | if asn not in set_asn_cnt: set_asn_cnt[asn] = 1 122 | else: set_asn_cnt[asn] += 1 123 | 124 | set_asn, cnt = list(set_asn_cnt.keys()), list(set_asn_cnt.values()) 125 | idx = np.argsort(cnt)[::-1] 126 | set_asn = np.array(set_asn)[idx] 127 | cnt = np.array(cnt)[idx] 128 | 129 | rc = [] 130 | for a,b in zip(set_asn, cnt): 131 | if b/df.shape[0] > th: rc.append(a) 132 | 133 | return sorted(rc) 134 | 135 | 136 | def link_root_cause(culprit_to_df): 137 | rcs = list(culprit_to_df.keys()) 138 | dfs = list(culprit_to_df.values()) 139 | 140 | def rc_to_set(rc): 141 | culprit_type, culprit_tuple = rc 142 | assert culprit_type in ["Prefix", "AS"] 143 | if culprit_type == "AS": 144 | culprit_set = set(chain(*culprit_tuple)) 145 | else: # must be "Prefix" 146 | culprit_set = {IPv4Network(p) for p in culprit_tuple} 147 | return culprit_type, culprit_set 148 | 149 | def rc_set_related(rc1, rc2): 150 | t1, set1 = rc1 151 | t2, set2 = rc2 152 | if t1 != t2: 153 
| return False 154 | if t1 == "AS": 155 | return set1&set2 156 | else: # t1 and t2 must be "Prefix" 157 | for i in set1: 158 | for j in set2: 159 | if i.overlaps(j): # check if they overlap 160 | return True 161 | if i.prefixlen == j.prefixlen: # check if they're two consecutive prefixes 162 | return abs((int(i[0])>>(32-i.prefixlen)) 163 | -(int(j[0])>>(32-j.prefixlen))) <= 1 164 | return False 165 | 166 | pool = list(map(rc_to_set, rcs)) 167 | group_id = [-1]*len(culprit_to_df) 168 | id_group = dict() 169 | next_id = 0 170 | for i in range(len(culprit_to_df)): 171 | if group_id[i] == -1: 172 | group_id[i] = next_id 173 | next_id += 1 174 | id_group[group_id[i]] = [i] 175 | for j in range(i+1, len(culprit_to_df)): 176 | if group_id[j] == group_id[i]: continue 177 | if rc_set_related(pool[i], pool[j]): 178 | if group_id[j] == -1: 179 | group_id[j] = group_id[i] 180 | id_group[group_id[i]].append(j) 181 | else: 182 | to_be_merged = id_group.pop(group_id[j]) 183 | id_group[group_id[i]] += to_be_merged 184 | for k in to_be_merged: group_id[k] = group_id[i] 185 | group_id_set = set(group_id) 186 | group_id_remapping = dict(zip(group_id_set, range(len(group_id_set)))) 187 | for idx, df in enumerate(dfs): 188 | df["group_id"] = group_id_remapping[group_id[idx]] 189 | return id_group, pd.concat(dfs, ignore_index=True) 190 | 191 | def event_aggregate(events): 192 | culprit2eventkey = {} 193 | eventkey2culprit = {} 194 | 195 | for k,v in events.items(): 196 | rc_1, rc_2 = root_cause_localize_2set(v) 197 | rc_3 = root_cause_localize_1set(v) 198 | if rc_1 or rc_2: 199 | culprit = "AS", (tuple(rc_1), tuple(rc_2)) 200 | elif rc_3: 201 | culprit = "AS", (tuple(rc_3),) 202 | else: 203 | culprit = "Prefix", k 204 | culprit2eventkey.setdefault(culprit, set()).add(k) 205 | eventkey2culprit[k] = culprit 206 | 207 | culprit_to_df = {k: pd.concat([events[i] for i in v]) 208 | for k, v in culprit2eventkey.items()} 209 | for k, v in culprit_to_df.items(): 210 | _, culprit_tuple = k 211 | v["culprit"] = json.dumps(culprit_tuple) 212 | rc_groups, df = link_root_cause(culprit_to_df) 213 | 214 | return rc_groups, df 215 | -------------------------------------------------------------------------------- /BEAM_engine/BEAM_model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn.functional 4 | import torch.utils.data 5 | import pickle 6 | import time 7 | from itertools import cycle 8 | from pathlib import Path 9 | 10 | class Analyzer(torch.utils.data.Dataset): 11 | 12 | def __init__(self, Q): 13 | super(Analyzer, self).__init__() 14 | self.Q = Q 15 | self.edge_list = [] 16 | 17 | self.asn_list = [] 18 | self.asn2idx = {} 19 | 20 | self.downstreams = [] 21 | self.upstreams = [] 22 | 23 | def read_edge_file(self, edge_file): 24 | def get_index(asn): 25 | if asn not in self.asn2idx: 26 | self.asn2idx[asn] = len(self.asn_list) 27 | self.asn_list.append(asn) 28 | self.downstreams.append(set()) 29 | self.upstreams.append(set()) 30 | return self.asn2idx[asn] 31 | 32 | for line in open(edge_file, "r"): 33 | if line[0] == "#": continue 34 | i, j, k = line.strip().split("|")[:3] 35 | 36 | index_i = get_index(i) 37 | index_j = get_index(j) 38 | 39 | assert index_i != index_j 40 | 41 | if k == "0": 42 | self.edge_list.append((index_i, index_j)) 43 | self.downstreams[index_i].add(index_j) 44 | self.upstreams[index_j].add(index_i) 45 | 46 | self.edge_list.append((index_j, index_i)) 47 | self.downstreams[index_j].add(index_i) 48 | 
self.upstreams[index_i].add(index_j) 49 | elif k == "-1": 50 | self.edge_list.append((index_i, index_j)) 51 | self.downstreams[index_i].add(index_j) 52 | self.upstreams[index_j].add(index_i) 53 | else: 54 | raise RuntimeError(f"unexpected rel {rel}") 55 | 56 | print(f"nodes: {len(self.asn_list)}") 57 | print(f"edges: {len(self.edge_list)}") 58 | 59 | self.init_sample_method() 60 | 61 | return self 62 | 63 | def init_sample_method(self, eps=0.01): 64 | upstreams = self.upstreams 65 | downstreams = self.downstreams 66 | 67 | global_cycler = cycle(range(len(self.asn_list))) 68 | none_cycler = cycle([None]) 69 | 70 | # providers as tails, thus negative samples 71 | negative_tails = [cycle(u-d) if u-d else none_cycler 72 | for u,d in zip(upstreams, downstreams)] 73 | 74 | # customers as heads, thus negative samples 75 | negative_heads = [cycle(d-u) if d-u else none_cycler 76 | for u,d in zip(upstreams, downstreams)] 77 | 78 | def get_local_tail_negative(head): 79 | return next(negative_tails[head]) 80 | 81 | def get_local_head_negative(tail): 82 | return next(negative_heads[tail]) 83 | 84 | def get_global_tail_negative(head): 85 | for tail_negative in global_cycler: 86 | if tail_negative != head and tail_negative not in downstreams[head]: 87 | return head, tail_negative 88 | 89 | def get_global_head_negative(tail): 90 | for head_negative in global_cycler: 91 | if head_negative != tail and head_negative not in upstreams[tail]: 92 | return head_negative, tail 93 | 94 | bound1 = 0.5-eps 95 | bound2 = 0.5+eps 96 | 97 | def draw_negative_sample(head, tail): 98 | r = np.random.random() 99 | if r < bound1: # try corrupt tail 100 | tail_negative = get_local_tail_negative(head) 101 | if tail_negative: 102 | sample = (head, tail_negative) 103 | else: # try corrupt head 104 | head_negative = get_local_head_negative(tail) 105 | if head_negative: 106 | sample = (head_negative, tail) 107 | else: # global negative sample 108 | sample = get_global_tail_negative(head) 109 | 110 | elif r > bound2: # try corrupt head 111 | head_negative = get_local_head_negative(tail) 112 | if head_negative: 113 | sample = (head_negative, tail) 114 | else: # try corrupt tail 115 | tail_negative = get_local_tail_negative(head) 116 | if tail_negative: 117 | sample = (head, tail_negative) 118 | else: # global negative sample 119 | sample = get_global_head_negative(tail) 120 | 121 | else: # global negative sample 122 | if r < 0.5: 123 | sample = get_global_head_negative(tail) 124 | else: 125 | sample = get_global_tail_negative(head) 126 | 127 | return sample 128 | 129 | self.draw_negative_sample = draw_negative_sample 130 | 131 | def __len__(self): 132 | return len(self.edge_list) * self.Q 133 | 134 | def __getitem__(self, index): 135 | positive_sample = self.edge_list[index // self.Q] 136 | negative_sample = self.draw_negative_sample(*positive_sample) 137 | input_vector = [0, *positive_sample, *negative_sample] 138 | return torch.tensor(input_vector, dtype=torch.int64, requires_grad=False) 139 | 140 | 141 | class BEAM(torch.nn.Module): 142 | 143 | def __init__(self, edge_file, Q=5, dimension=128, train_dir=Path("./"), cuda_device='cuda', num_workers=20): 144 | super(BEAM, self).__init__() 145 | 146 | self.use_cuda = torch.cuda.is_available() 147 | self.device = torch.device(cuda_device if self.use_cuda else 'cpu') 148 | print("device: {}".format(self.device)) 149 | 150 | self.train_dir = train_dir 151 | 152 | self.analyzer = Analyzer(Q).read_edge_file(edge_file) 153 | self.node_embedding = torch.nn.Embedding( 154 | 
len(self.analyzer.asn_list), dimension) 155 | self.rela_embedding = torch.nn.Embedding(1, dimension) 156 | self.link_embedding = torch.nn.Embedding(1, dimension) 157 | 158 | self.num_workers = num_workers 159 | 160 | def forward(self, batchVector): 161 | idx_k = batchVector[:,0] 162 | link = torch.nn.functional.softmax(self.link_embedding(idx_k), dim=1) 163 | rela = self.rela_embedding(idx_k) 164 | pi = self.node_embedding(batchVector[:,1]) 165 | pj = self.node_embedding(batchVector[:,2]) 166 | ni = self.node_embedding(batchVector[:,3]) 167 | nj = self.node_embedding(batchVector[:,4]) 168 | 169 | # softplus(corrupt - correct) 170 | relaError = torch.sum((nj-ni-pj+pi)*rela, dim=1) # criteria 2 171 | linkError = torch.sum((pj-pi-nj+ni)*(pj-pi+nj-ni)*link, dim=1) 172 | loss = torch.nn.functional.softplus(relaError + linkError) 173 | 174 | return loss 175 | 176 | def train(self, epoches=500): 177 | self.to(device=self.device) 178 | optimizer = torch.optim.Adam(self.parameters(), lr=1e-3, weight_decay=0) 179 | generator = torch.utils.data.DataLoader( 180 | self.analyzer, batch_size=1024, shuffle=True, 181 | num_workers=self.num_workers) 182 | 183 | for epoch in range(1, epoches + 1): 184 | if epoch%100 == 0: 185 | torch.save({ 186 | "epoch": epoch, 187 | "model_state_dict": self.state_dict(), 188 | "optimizer_state_dict": optimizer.state_dict(), 189 | }, self.train_dir / "checkpoint") 190 | loss = 0.0 191 | tik = time.time() 192 | for batchData in generator: 193 | optimizer.zero_grad() 194 | batchData = batchData.to(device=self.device) 195 | batchLoss = self(batchData).sum() 196 | loss += float(batchLoss) 197 | batchLoss.backward() 198 | optimizer.step() 199 | tok = time.time() 200 | print(f"Epoch{epoch:4d}/{epoches} Loss: {loss:e} Time: {tok-tik:.1f}s") 201 | 202 | def save_embeddings(self, path='.'): 203 | print("save embeddings...") 204 | path = Path(path) 205 | node_keys = self.analyzer.asn_list 206 | rela_keys = ["p2c"] 207 | link_keys = ["p2c"] 208 | 209 | def dump_embedding(keys, tensor, filePath): 210 | if self.use_cuda: 211 | emb = tensor.weight.cpu().data.numpy() 212 | else: 213 | emb = tensor.weight.data.numpy() 214 | emb = dict(zip(keys, emb)) 215 | pickle.dump(emb, open(filePath, 'wb')) 216 | 217 | dump_embedding(node_keys, self.node_embedding, path/'node.emb') 218 | dump_embedding(rela_keys, self.rela_embedding, path/'rela.emb') 219 | dump_embedding(link_keys, self.link_embedding, path/'link.emb') 220 | -------------------------------------------------------------------------------- /post_processor/html/template_routeviews.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | REPLACE_WITH_TITLE 5 | 6 | 7 | 228 | 229 | 304 | 305 | 356 | 357 | 394 | 395 | 396 | 397 | 398 | 399 |

[template_routeviews.html body: section headings "Report", "Runtime stats", and "All alarms", plus the REPLACE_WITH_EXPLANATION placeholder; the surrounding HTML markup (template lines 400-458) is not preserved in this dump]
459 | 460 | REPLACE_WITH_SECTIONS 461 | 462 | 478 | 479 | 480 | 481 | -------------------------------------------------------------------------------- /post_processor/alarm_postprocess_routeviews.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | #-*- coding: utf-8 -*- 3 | 4 | from pathlib import Path 5 | import pandas as pd 6 | import numpy as np 7 | import json 8 | import click 9 | from functools import lru_cache 10 | from datetime import datetime, timedelta 11 | 12 | from rpki_validator import RPKI 13 | from irr_validator import RADB 14 | from whois_lookup import whois_match 15 | 16 | import sys 17 | sys.path.append(str(Path(__file__).resolve().parent.parent)) 18 | from data.caida_as_org.fetch_data import get_most_recent as as_org_file 19 | from data.caida_as_org.query import load as parse_as_org 20 | from data.caida_as_rel.fetch_data import get as as_rel_file 21 | 22 | def get_one_asn(asn): 23 | return asn.strip("{}").split(",")[0] 24 | 25 | def get_recent_monday(date_str): 26 | date = datetime.strptime(date_str, "%Y%m%d").date() 27 | monday = date - timedelta(days=date.weekday()) 28 | return monday 29 | 30 | def load_as_org(time): 31 | time, fpath = as_org_file(time) 32 | as_info, org_info = parse_as_org(time) 33 | 34 | def get_org_id(asn): 35 | if asn not in as_info: 36 | return asn 37 | info = as_info[asn] 38 | return info["opaque_id"] if info["opaque_id"] != "" else info["org_id"] 39 | 40 | def from_same_org(asn1, asn2): 41 | if get_org_id(asn1) == get_org_id(asn2): 42 | return get_org_id(asn1) 43 | else: 44 | return "-" 45 | 46 | def get_asn_country(asn): 47 | org_id = get_org_id(asn) 48 | if org_id in org_info: 49 | return org_info[org_id]["country"] 50 | 51 | return as_info, org_info, from_same_org, get_asn_country 52 | 53 | def load_as_rel(serial, time): 54 | target = as_rel_file(serial, time) 55 | as_rel_map = {} 56 | lines = open(target, "r").readlines() 57 | for l in lines: 58 | if l[0] == "#": continue 59 | as1, as2, rel = l.split("|")[:3] 60 | rel = int(rel) 61 | as_rel_map.setdefault(as1, {-1:set(), 0:set(), 1:set()})[+rel].add(as2) 62 | as_rel_map.setdefault(as2, {-1:set(), 0:set(), 1:set()})[-rel].add(as1) 63 | 64 | def get_as_rel(as1, as2): 65 | if as1 in as_rel_map: 66 | for rel, as_set in as_rel_map[as1].items(): 67 | if as2 in as_set: return rel 68 | return None 69 | 70 | def get_all_ngbrs(asn): 71 | if asn not in as_rel_map: return None 72 | ret = set() 73 | for v in as_rel_map[asn].values(): 74 | ret |= v 75 | return ret 76 | 77 | def have_connection(as1, as2): 78 | if as1 in as_rel_map and as2 in as_rel_map: 79 | for rel,ngbrs in as_rel_map[as1].items(): 80 | if as2 in ngbrs: 81 | return f"rel({rel})" 82 | ret = [] 83 | for rel in [-1, 0, 1]: 84 | if as_rel_map[as1][rel]&as_rel_map[as2][rel]: 85 | ret.append(f"{rel}") 86 | if ret: 87 | return f"ngbr({';'.join(ret)})" 88 | return "-" 89 | 90 | return as_rel_map, get_as_rel, have_connection 91 | 92 | def different_origin_country(path1, path2, get_asn_country): 93 | cty1 = get_asn_country(path1[-1]) 94 | cty2 = get_asn_country(path2[-1]) 95 | if cty1 != cty2: 96 | return f"{cty1};{cty2}" 97 | else: 98 | return "-" 99 | 100 | def have_origin_connection(path1, path2, have_connection): 101 | return have_connection(path1[-1], path2[-1]) 102 | 103 | def have_unknown_asn(path, as_rel_map): 104 | ret = [] 105 | for i in path: 106 | if i not in as_rel_map: 107 | ret.append(str(i)) 108 | if ret: 109 | return ";".join(set(ret)) 110 | return "-" 111 | 112 | def 
have_reserved_asn(path): 113 | # reserved ASN (last updated: 2024-04-10) 114 | # https://www.iana.org/assignments/as-numbers/as-numbers.xhtml 115 | ret = [] 116 | for i in path: 117 | i = int(i) 118 | if (i == 0 or i == 112 or i == 23456 or 119 | (i >= 64496 and i <= 131071) or 120 | (i >= 153914 and i <= 196607) or 121 | (i >= 216476 and i <= 262143) or 122 | (i >= 274845 and i <= 327679) or 123 | (i >= 329728 and i <= 393215) or 124 | i >= 402333): 125 | ret.append(str(i)) 126 | if ret: 127 | return ";".join(set(ret)) 128 | return "-" 129 | 130 | def non_valley_free_or_none_rel(path, get_as_rel): 131 | none_rel = [] 132 | non_valley_free = False 133 | rel_seq = [] 134 | state = 1 135 | for a,b in zip(path[:-1], path[1:]): 136 | if a == b: 137 | rel_seq.append("x") 138 | continue 139 | r = get_as_rel(a, b) 140 | if r is None: 141 | rel_seq.append("x") 142 | none_rel.append(f"({a} {b})") 143 | continue 144 | rel_seq.append(str(r)) 145 | if state == 1: 146 | state = r 147 | continue 148 | if r != -1: 149 | non_valley_free = True 150 | 151 | if non_valley_free: 152 | non_valley_free = " ".join(rel_seq) 153 | else: 154 | non_valley_free = "-" 155 | 156 | if none_rel: 157 | none_rel = ";".join(set(none_rel)) 158 | else: 159 | none_rel = "-" 160 | 161 | return non_valley_free, none_rel 162 | 163 | def detour_country(path1, path2, get_asn_country): 164 | countries0 = set(filter(lambda x: x is not None, 165 | map(get_asn_country, path1[:-1]))) 166 | countries1 = set(filter(lambda x: x is not None, 167 | map(get_asn_country, path2[:-1]))) 168 | detour_new_country = len(countries1-countries0) > 0 169 | return detour_new_country 170 | 171 | def as_prepend(path): 172 | ret = [] 173 | for asn, cnt in zip(*np.unique(path, return_counts=True)): 174 | if cnt > 1: ret.append(asn) 175 | if ret: 176 | return ";".join(ret) 177 | return "-" 178 | 179 | def origin_different_upstream(path1, path2, get_as_rel): 180 | path1 = [v for i,v in enumerate(path1) if i == 0 or v != path1[i-1]] 181 | path2 = [v for i,v in enumerate(path2) if i == 0 or v != path2[i-1]] 182 | if len(path1) >= 2 and len(path2) >= 2 \ 183 | and path1[-1] == path2[-1] \ 184 | and path1[-2] != path2[-2] \ 185 | and get_as_rel(path1[-2], path1[-1]) is not None \ 186 | and get_as_rel(path2[-2], path2[-1]) is not None: 187 | return f"{path1[-2]};{path2[-2]}" 188 | return "-" 189 | 190 | def origin_rpki_valid(rpki, prefix, path): 191 | return rpki.validate(prefix, path[-1]) 192 | 193 | def origin_irr_valid(radb, prefix, path): 194 | return radb.validate(prefix, path[-1]) 195 | 196 | def origin_whois_match(prefix, path): 197 | return whois_match(prefix, path[-1]) 198 | 199 | def path_superset(path1, path2): 200 | return ",".join(path1) in ",".join(path2) 201 | 202 | @lru_cache(maxsize=10) 203 | def _get_rpki(date): 204 | return RPKI().load_data(date.year, date.month, date.day) 205 | 206 | def get_rpki(date): 207 | return _get_rpki(get_recent_monday(date)) 208 | 209 | @lru_cache(maxsize=10) 210 | def _get_radb(date): 211 | return RADB().load_data(date.year, date.month, date.day) 212 | 213 | def get_radb(date): 214 | return _get_radb(get_recent_monday(date)) 215 | 216 | @click.command() 217 | @click.option("--collector", "-c", type=str, default="wide", help="the name of RouteView collector to postprocess the detection results") 218 | @click.option("--year", "-y", type=int, required=True, help="the year of the detection results, e.g., 2024") 219 | @click.option("--month", "-m", type=int, required=True, help="the month of the detection results, e.g., 
8") 220 | def postprocess(collector, year, month): 221 | as_info, org_info, from_same_org, get_asn_country = load_as_org(f"{year}{month:02d}01") 222 | as_rel_map, get_as_rel, have_connection = load_as_rel("1", f"{year}{month:02d}01") 223 | 224 | repo_dir = Path(__file__).resolve().parent.parent 225 | collector_result_dir = repo_dir/"routing_monitor"/"detection_result"/collector 226 | reported_alarm_dir = collector_result_dir/"reported_alarms"/f"{year}{month:02d}" 227 | info = json.load(open(reported_alarm_dir/f"info_{year}{month:02d}.json", "r")) 228 | flags_dir = reported_alarm_dir.parent/f"{year}{month:02d}.flags" 229 | flags_dir.mkdir(parents=True, exist_ok=True) 230 | 231 | for i in info: 232 | if i["save_path"] is None: continue 233 | df = pd.read_csv(i["save_path"]) 234 | prefix1 = df["prefix1"].values 235 | prefix2 = df["prefix2"].values 236 | path1 = [list(map(get_one_asn, i.split(" "))) for i in df["path1"].values] 237 | path2 = [list(map(get_one_asn, i.split(" "))) for i in df["path2"].values] 238 | 239 | non_valley_free_1, none_rel_1 = np.array(list(map(lambda x: non_valley_free_or_none_rel(x, get_as_rel), path1))).T 240 | non_valley_free_2, none_rel_2 = np.array(list(map(lambda x: non_valley_free_or_none_rel(x, get_as_rel), path2))).T 241 | 242 | rpki = get_rpki(i["d0"][:8]) 243 | radb = get_radb(i["d0"][:8]) 244 | 245 | flags = pd.DataFrame.from_dict({ 246 | "subprefix_change": [p1 != p2 for p1, p2 in zip(prefix1, prefix2)], 247 | "origin_change": [l1[-1] != l2[-1] for l1, l2 in zip(path1, path2)], 248 | "origin_same_org": [from_same_org(l1[-1], l2[-1]) for l1, l2 in zip(path1, path2)], 249 | "origin_country_change": [different_origin_country(l1, l2, get_asn_country) for l1, l2 in zip(path1, path2)], 250 | "origin_connection": [have_origin_connection(l1, l2, have_connection) for l1, l2 in zip(path1, path2)], 251 | "origin_different_upstream": [origin_different_upstream(l1, l2, get_as_rel) for l1, l2 in zip(path1, path2)], 252 | "origin_rpki_1": [origin_rpki_valid(rpki, p, l) for p, l in zip(prefix1, path1)], 253 | "origin_rpki_2": [origin_rpki_valid(rpki, p, l) for p, l in zip(prefix2, path2)], 254 | "origin_irr_1": [origin_irr_valid(radb, p, l) for p, l in zip(prefix1, path1)], 255 | "origin_irr_2": [origin_irr_valid(radb, p, l) for p, l in zip(prefix2, path2)], 256 | "origin_whois_1": [origin_whois_match(p, l) for p, l in zip(prefix1, path1)], 257 | "origin_whois_2": [origin_whois_match(p, l) for p, l in zip(prefix2, path2)], 258 | "unknown_asn_1": [have_unknown_asn(l, as_rel_map) for l in path1], 259 | "unknown_asn_2": [have_unknown_asn(l, as_rel_map) for l in path2], 260 | "reserved_path_1": [have_reserved_asn(l) for l in path1], 261 | "reserved_path_2": [have_reserved_asn(l) for l in path2], 262 | "non_valley_free_1": non_valley_free_1, 263 | "non_valley_free_2": non_valley_free_2, 264 | "none_rel_1": none_rel_1, 265 | "none_rel_2": none_rel_2, 266 | "as_prepend_1": [as_prepend(l) for l in path1], 267 | "as_prepend_2": [as_prepend(l) for l in path2], 268 | "detour_country": [detour_country(l1, l2, get_asn_country) for l1, l2 in zip(path1, path2)], 269 | "path1_in_path2": [path_superset(l1, l2) for l1, l2 in zip(path1, path2)], 270 | "path2_in_path1": [path_superset(l2, l1) for l1, l2 in zip(path1, path2)], 271 | }) 272 | flags.to_csv(flags_dir/f"{Path(i['save_path']).stem}.flags.csv", index=False) 273 | 274 | if __name__ == "__main__": 275 | postprocess() 276 | -------------------------------------------------------------------------------- /readme.md: 
-------------------------------------------------------------------------------- 1 | # Semantics-Aware Routing Anomaly Detection System 2 | 3 | This is a demonstration codebase for the routing anomaly detection system presented in the USENIX Security 2024 [paper](https://www.usenix.org/conference/usenixsecurity24/presentation/chen-yihao), _Learning with Semantics: Towards a Semantics-Aware Routing Anomaly Detection System_. The repo is intended for research and demonstration purposes and may not be ready for production use. A full-featured, open-source version of the system, potentially including a public service, is currently under development; please see the [Future Work](#future-work) section for more information. 4 | 5 | Contact: yh-chen21@mails.tsinghua.edu.cn 6 | 7 | ## System Overview 8 | 9 | 10 | 11 | The system consists of three main modules: 12 | 13 | - **BEAM Engine** (`BEAM_engine/`): Uses AS business relationship data as input to train the BEAM model, which is used to quantify the path difference (abnormality) of route changes. 14 | 15 | - **Routing Monitor** (`routing_monitor/`): Takes BGP update announcements as input and outputs detected route changes. 16 | 17 | - **Anomaly Detector** (`anomaly_detector/`): Performs anomaly detection on the route changes and conducts correlation analysis on detected anomalous route changes, outputting anomaly alarms. 18 | 19 | A post-processing module (`post_processor/`) is additionally introduced for anomaly inspection and well-formatted HTML reports. 20 | 21 | ## Workflow 22 | 23 | A typical workflow with this codebase is as follows: 24 | 25 | 1. Train the BEAM model. 26 | 2. Detect route changes from a window of routing data. 27 | 3. Use the BEAM model to quantify the path difference of the route changes. 28 | 4. Identify those with abnormally high path difference, aggregate them, and raise alarms. 29 | 5. Generate a formatted anomaly report. 30 | 31 | ## Get Started 32 | 33 | ### 0. Prepare the environment 34 | 35 | - Python (>=3.8) is required, along with the necessary packages. A GPU with CUDA is recommended for model training. 36 | 37 | Set it up using Anaconda or Miniconda as follows: 38 | 39 | ```bash 40 | conda create -n beam python=3.8 numpy pandas scipy tqdm joblib click pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia -y 41 | conda activate beam 42 | ``` 43 | 44 | - The [BGPdump tool](https://github.com/RIPE-NCC/bgpdump) is required for parsing MRT-format BGP routing data. 45 | 46 | Build it from source and link the binary to `$YOUR_REPO_PATH/data/routeviews/bgpd`, as follows: 47 | 48 | ```bash 49 | git clone https://github.com/RIPE-NCC/bgpdump.git 50 | cd bgpdump 51 | sh ./bootstrap.sh 52 | make 53 | ln -s $(pwd)/bgpdump $YOUR_REPO_PATH/data/routeviews/bgpd 54 | $YOUR_REPO_PATH/data/routeviews/bgpd -T # should print the test output 55 | ``` 56 | 57 | ### 1. Train the BEAM model 58 | 59 | Run `BEAM_engine/train.py` for model training. See all available parameters with `--help`. 60 | 61 | An example run is as follows: 62 | 63 | ```bash 64 | python train.py --serial 2 \ 65 | --time 20240801 \ 66 | --Q 10 \ 67 | --dimension 128 \ 68 | --epoches 1000 \ 69 | --device 0 \ 70 | --num-workers 10 71 | ``` 72 | 73 | This example downloads the [CAIDA AS relationship data](https://publicdata.caida.org/datasets/as-relationships/) of serial-2, Aug. 1, 2024, to train a BEAM model for 1000 epochs, with the number of negative samples (`Q`) set to 10 and the embedding vector dimension (`d`) set to 128.
Training is executed on device 0 (either CPU or GPU, depending on the machine), and up to 10 parallel workers are used for data processing. 74 | 75 | Notes: 76 | 77 | - The CAIDA AS relationship data is updated monthly. A typical archive today contains approximately 500,000 AS relationship records and is around 1.5MB in size. 78 | 79 | - The required CAIDA data is downloaded upon first use and stored in either `data/caida_as_rel/serial-1/` or `data/caida_as_rel/serial-2/`. Alternatively, other sources can be used if they follow the same format. 80 | 81 | - The trained model is saved in a directory under `BEAM_engine/models/`, named according to the training parameters. This includes the trained embedding vectors (`link.emb`, `node.emb`, `rela.emb`). 82 | 83 | - For reference, on a dual-core Xeon E5-2650v4 with a GeForce RTX 2080 Ti, training for 1000 epoches takes about 10 hours, with peak memory usage within 10GB. 84 | 85 | ### 2. Detect route changes 86 | 87 | Run `routing_monitor/detect_route_change_routeviews.py` for monthly route change detection. See all available parameters with `--help`. 88 | 89 | An example run is as follows: 90 | 91 | ```bash 92 | python detect_route_change_routeviews.py \ 93 | --collector wide \ 94 | --year 2024 \ 95 | --month 8 96 | ``` 97 | 98 | This example downloads and identifies route changes with the BGP update announcements from the `wide` collector of [RouteViews](http://routeviews.org/), for the entire month of August, 2024. 99 | 100 | Notes: 101 | 102 | - RouteViews maintains over 30 collectors, each of which archives BGP update announcements in [MRT format](https://www.rfc-editor.org/rfc/rfc6396) at approximately 15-minute intervals. BGPdump is called as a subprocess to load these data. Other sources of data can also be used if they adhere to the MRT format. 103 | 104 | - The required RouteViews data is downloaded upon first use and stored in a directory under `data/routeviews/updates/`, named after the chosen collector. 105 | 106 | - This script processes the routing data of an entire month sequentially, in an offline manner. A global routing table is maintained in a Trie structure to track the route changes. The results are stored in a directory under `routing_monitor/detection_result/`, named after the chosen collector. The results include the identified route changes and daily snapshots of the global routing table. 107 | 108 | ### 3. Quantify path difference 109 | 110 | Run `anomaly_detector/BEAM_diff_evaluator_routeviews.py` for path difference evaluation on the monthly route changes. See all available parameters with `--help`. 111 | 112 | An example run is as follows: 113 | 114 | ```bash 115 | python BEAM_diff_evaluator_routeviews.py \ 116 | --collector wide \ 117 | --year 2024 \ 118 | --month 8 \ 119 | --beam-model 20240801.as-rel2.1000.10.128 120 | ``` 121 | 122 | This example uses the BEAM model trained in [Step 1](#1-train-the-beam-model) to evaluate the path difference of the route changes detected in [Step 2](#2-detect-route-changes). 123 | 124 | Notes: 125 | 126 | - This script evaluates the path difference for route changes of an entire month sequentially, in an offline manner. The results are stored in `BEAM_metric/`, under the same parent directory as the route change directory of the chosen collector. 127 | 128 | ### 4. Detect anomalies 129 | 130 | Run `anomaly_detector/report_anomaly_routeviews.py` to detect anomalies based on the path difference of route changes. See all available parameters with `--help`. 
131 | 132 | An example run is as follows: 133 | 134 | ```bash 135 | python report_anomaly_routeviews.py \ 136 | --collector wide \ 137 | --year 2024 \ 138 | --month 8 139 | ``` 140 | 141 | This example detects anomalies based on the route changes detected in [Step 2](#2-detect-route-changes) and their path difference evaluated in [Step 3](#3-quantify-path-difference). 142 | 143 | Notes: 144 | 145 | - This script detects anomalies for route changes of an entire month sequentially, in an offline manner. The results are stored in `reported_alarms/`, under the same parent directory as the route change directory of the chosen collector. 146 | 147 | - The results include the anomaly alarms raised for each time window, in separate CSV files, as well as a JSON file describing the overall information of the month's detection. Each alarm contains the time window, prefixes, associated ASes, and corresponding anomalous route changes, all associated with a single anomaly. 148 | 149 | ### 5. Generate the report 150 | 151 | Run `post_processor/alarm_postprocess_routeviews.py` to incorporate additional knowledge, e.g., RPKI states, for identifying properties associated with the generated alarms. See all available parameters with `--help`. 152 | 153 | An example run is as follows: 154 | 155 | ```bash 156 | python alarm_postprocess_routeviews.py \ 157 | --collector wide \ 158 | --year 2024 \ 159 | --month 8 160 | ``` 161 | 162 | This example identifies properties associated with the alarms generated in [Step 4](#4-detect-anomalies). 163 | 164 | Notes: 165 | 166 | - This script utilizes additional knowledge to identify several properties associated with the alarms, for a better understanding of the anomalies. The results are stored in `reported_alarms.flags/`, under the same parent directory as the route change directory of the chosen collector. 167 | 168 | - Each alarm is associated with the following properties: 169 | - `subprefix_change`: the alarm includes route changes involving sub-prefixes. 170 | - `origin_change`: the alarm includes route changes involving a change of origin ASes. 171 | - `origin_same_org`: the alarm includes origin changes where the different origin ASes are from the same organization. 172 | - `origin_country_change`: the alarm includes origin changes where the different origin ASes are from different countries. 173 | - `origin_connection`: the alarm includes origin changes where the different origin ASes are connected. 174 | - `origin_different_upstream`: the alarm includes route changes where the same origin AS is reached through different upstream providers. 175 | - `origin_rpki_1`: the origin before the change is RPKI-valid. 176 | - `origin_rpki_2`: the origin after the change is RPKI-valid. 177 | - `unknown_asn_1`: the route before the change includes an unknown ASN. 178 | - `unknown_asn_2`: the route after the change includes an unknown ASN. 179 | - `reserved_path_1`: the route before the change includes a reserved ASN. 180 | - `reserved_path_2`: the route after the change includes a reserved ASN. 181 | - `non_valley_free_1`: the route before the change is non-valley-free. 182 | - `non_valley_free_2`: the route after the change is non-valley-free. 183 | - `none_rel_1`: the route before the change includes unknown links. 184 | - `none_rel_2`: the route after the change includes unknown links. 185 | - `as_prepend_1`: the route before the change includes prepended ASes. 186 | - `as_prepend_2`: the route after the change includes prepended ASes.
187 | - `detour_country`: the alarm includes routes detouring through other countries. 188 | - `path1_in_path2`: the route before the change is a subset of the route after the change. 189 | - `path2_in_path1`: the route after the change is a subset of the route before the change. 190 | 191 | After the properties are associated, run `post_processor/summary_routeviews.py` to generate an HTML report about the month's detection results. See all available parameters with `--help`. 192 | 193 | An example run is as follows: 194 | 195 | ```bash 196 | python summary_routeviews.py \ 197 | --collector wide \ 198 | --year 2024 \ 199 | --month 8 200 | ``` 201 | 202 | This example generates an HTML report and a JSON-line-format file from the alarms generated in [Step 4](#4-detect-anomalies). 203 | 204 | Notes: 205 | 206 | - The HTML report is stored in `post_processor/html/`, and the JSON-line-format file is stored in `post_processor/summary_output/`. 207 | 208 | - The HTML report is self-contained, with necessary descriptions of the terms used. 209 | 210 | ## Future Work 211 | 212 | **Updated on Sep. 13, 2024**: 213 | 214 | A full-featured, open-source version of the anomaly detection system is under development, aimed at deployment in production environments such as ISPs, and potentially as a public service to monitor the Internet and issue BGP anomaly warnings. This includes plans to refactor key functions using the Rust programming language and package them as crates for public access. The current organization of these components is as follows: 215 | 216 | - **In progress:** A module to synchronize routing data in real time from RouteViews, RIPE RIS, and self-operated or peering ASes, stored locally in a database. 217 | - SQLite for local storage and management of routing data. 218 | - A subset of the functionality of BGPStream. 219 | - Integration of BGPdump. 220 | - KVM/Docker support for virtual routers. 221 | - **Pending:** A module to train BEAM models using the latest CAIDA AS relationship data. 222 | - **Pending:** A module to process real-time routing data and detect anomalies. 223 | - **Pending:** A website or app for displaying and analyzing detection results in real time.
224 | 225 | --- 226 | -------------------------------------------------------------------------------- /post_processor/summary_routeviews.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding: utf-8 -*- 3 | 4 | import json 5 | import pandas as pd 6 | import numpy as np 7 | from pathlib import Path 8 | from datetime import datetime 9 | import subprocess 10 | import calendar 11 | import click 12 | 13 | import sys 14 | sys.path.append(str(Path(__file__).resolve().parent.parent)) 15 | from anomaly_detector.utils import event_aggregate 16 | 17 | @click.command() 18 | @click.option("--collector", "-c", type=str, default="wide", help="the name of RouteView collector to generate the report") 19 | @click.option("--year", "-y", type=int, required=True, help="the year of the detection results, e.g., 2024") 20 | @click.option("--month", "-m", type=int, required=True, help="the month of the detection results, e.g., 8") 21 | def main(collector, year, month): 22 | repo_dir = Path(__file__).resolve().parent.parent 23 | collector_result_dir = repo_dir/"routing_monitor"/"detection_result"/collector 24 | reported_alarm_dir = collector_result_dir/"reported_alarms"/f"{year}{month:02d}" 25 | route_change_dir = collector_result_dir/"route_change" 26 | info = json.load(open(reported_alarm_dir/f"info_{year}{month:02}.json", "r")) 27 | flags_dir = reported_alarm_dir.parent/f"{year}{month:02d}.flags" 28 | 29 | summary_dir = Path(__file__).resolve().parent/"summary_output" 30 | summary_dir.mkdir(parents=True, exist_ok=True) 31 | 32 | html_dir = Path(__file__).resolve().parent/"html" 33 | 34 | metric = "diff_balance" 35 | 36 | def has_flag(v): 37 | return v != "-" 38 | v_flag = np.vectorize(has_flag) 39 | 40 | def invalid_asn(v): 41 | return v == "Invalid" 42 | v_invalid_asn = np.vectorize(invalid_asn) 43 | 44 | def invalid_len(v): 45 | return v == "invalid_length" 46 | v_invalid_len = np.vectorize(invalid_len) 47 | 48 | def valid(v): 49 | return v == "valid" 50 | v_valid = np.vectorize(valid) 51 | 52 | def summary(): 53 | global_group_id = 0 54 | dfs = [] 55 | for i in info: 56 | if i["save_path"] is None: continue 57 | df = pd.read_csv(i["save_path"]) 58 | flags = pd.read_csv(flags_dir/f"{Path(i['save_path']).stem}.flags.csv") 59 | df = pd.concat([df, flags], axis=1) 60 | 61 | # highly possible origin hijack 62 | anomaly_t1 = df["origin_change"] \ 63 | & (~v_flag(df["origin_same_org"])) \ 64 | & ((v_invalid_asn(df["origin_rpki_1"]) 65 | ^ v_invalid_asn(df["origin_rpki_2"])) 66 | | (v_invalid_asn(df["origin_irr_1"]) 67 | ^ v_invalid_asn(df["origin_irr_2"])) 68 | | (df["origin_whois_1"] ^ df["origin_whois_2"])) 69 | 70 | # highly possible route leak 71 | anomaly_t2 = v_flag(df["non_valley_free_1"]) \ 72 | | v_flag(df["non_valley_free_2"]) 73 | 74 | # highly possible path manipulation 75 | anomaly_t3 = v_flag(df["reserved_path_1"]) \ 76 | | v_flag(df["reserved_path_2"]) \ 77 | | v_flag(df["none_rel_1"]) \ 78 | | v_flag(df["none_rel_2"]) \ 79 | | np.isinf(df["diff"]) 80 | exception_t3 = (~v_flag(df["reserved_path_1"])) \ 81 | & (~v_flag(df["reserved_path_2"])) \ 82 | & (v_flag(df["none_rel_1"]) \ 83 | | v_flag(df["none_rel_2"])) 84 | 85 | # highly possible ROA/IRR/WHOIS misconfiguration 86 | anomaly_t4 = v_flag(df["origin_same_org"]) \ 87 | & ((v_invalid_asn(df["origin_rpki_1"]) 88 | ^ v_invalid_asn(df["origin_rpki_2"])) 89 | | (v_invalid_asn(df["origin_irr_1"]) 90 | ^ v_invalid_asn(df["origin_irr_2"])) 91 | | (df["origin_whois_1"] ^ 
df["origin_whois_2"])) 92 | 93 | # highly possible benign MOAS 94 | benign_t1 = df["origin_change"] \ 95 | & (v_flag(df["origin_same_org"]) 96 | | v_flag(df["origin_connection"]) 97 | | (v_valid(df["origin_rpki_1"]) 98 | & v_valid(df["origin_rpki_2"]))) 99 | 100 | # highly possible AS prepending 101 | benign_t2 = (~df["origin_change"]) \ 102 | & (has_flag(df["as_prepend_1"]) 103 | ^ has_flag(df["as_prepend_2"])) 104 | 105 | # highly possible multi-homing 106 | benign_t3 = v_flag(df["origin_different_upstream"]) 107 | 108 | # no any sign of anomaly 109 | benign_t4 = (~df["detour_country"]) \ 110 | & v_valid(df["origin_rpki_1"]) \ 111 | & v_valid(df["origin_rpki_2"]) 112 | 113 | # possible false alarms due to the nature of diff computation 114 | benign_t5 = (df["path_l1"]+df["path_l2"])/2 <= 3 115 | 116 | # possible prefix transfer 117 | benign_t6 = df["path1_in_path2"] 118 | 119 | 120 | df["a1"] = anomaly_t1 121 | df["a2"] = anomaly_t2 122 | df["a3"] = anomaly_t3 123 | df["a4"] = anomaly_t4 124 | df["b1"] = benign_t1 125 | df["b2"] = benign_t2 126 | df["b3"] = benign_t3 127 | df["b4"] = benign_t4 128 | df["b5"] = benign_t5 129 | df["b6"] = benign_t6 130 | 131 | anomaly = anomaly_t1 | anomaly_t2 | anomaly_t3 | anomaly_t4 132 | benign = benign_t1 | benign_t2 | benign_t3 | benign_t4 | benign_t5 | benign_t6 133 | 134 | df["pattern"] = "unknown" 135 | df.loc[benign, ["pattern"]] = "benign" 136 | df.loc[anomaly, ["pattern"]] = "anomaly" 137 | 138 | df = df.loc[anomaly | (~benign)] # post-filtering 139 | df = df.loc[(~exception_t3)|anomaly_t1|anomaly_t2|anomaly_t4] 140 | 141 | event_key = i["event_key"] 142 | forwarder_th = i["forwarder_th"] 143 | 144 | events = {} 145 | for key,ev in df.groupby(event_key): # re-grouping and filtering 146 | if ev.shape[0] <= forwarder_th: continue 147 | events[key] = ev 148 | 149 | if events: 150 | _, df = event_aggregate(events) 151 | n_alarms = len(df["group_id"].unique()) 152 | assert np.max(df["group_id"]) == n_alarms-1, f"{np.max(df['group_id'])}, {n_alarms-1}" 153 | df["group_id"] += global_group_id 154 | global_group_id += n_alarms 155 | dfs.append(df) 156 | 157 | df = pd.concat(dfs) 158 | df.to_csv(summary_dir/f"alarms_after_post_process_{collector}_{year}{month:02}.csv", index=False) 159 | return df 160 | 161 | df = summary() 162 | 163 | def reason(tag, row): 164 | if tag == "a1": 165 | fields = ["origin_rpki_1", "origin_rpki_2", "origin_irr_1", "origin_irr_2", "origin_whois_1", "origin_whois_2"] 166 | elif tag == "a2": 167 | fields = ["non_valley_free_1", "non_valley_free_2"] 168 | elif tag == "a3": 169 | fields = ["reserved_path_1", "reserved_path_2", 170 | "none_rel_1", "none_rel_2", "unknown_asn_1", "unknown_asn_2"] 171 | elif tag == "a4": 172 | fields = ["origin_same_org", "origin_rpki_1", "origin_rpki_2", "origin_irr_1", "origin_irr_2", "origin_whois_1", "origin_whois_2"] 173 | elif tag == "b1": 174 | fields = ["origin_same_org", "origin_connection", 175 | "origin_rpki_1", "origin_rpki_2"] 176 | elif tag == "b2": 177 | fields = ["as_prepend_1", "as_prepend_2"] 178 | elif tag == "b3": 179 | fields = ["origin_different_upstream"] 180 | elif tag == "b4": 181 | fields = ["origin_rpki_1", "origin_rpki_2"] 182 | elif tag == "b5": 183 | fields = [] 184 | 185 | r = {i: str(row[i]) for i in fields if has_flag(row[i])} 186 | return r 187 | 188 | def terminal_checkout(group_id, group): 189 | tags = ["a1", "a2", "a3", "a4", "b1", "b2", "b3", "b4", "b5"] 190 | 191 | print(f"alarm_id: {group_id}") 192 | for prefix_key, ev in group.groupby(["prefix1", "prefix2"]): 
193 | print(f"* {' -> '.join(prefix_key)}") 194 | for _, row in ev.iterrows(): 195 | print(f" path1: {row['path1']}") 196 | print(f" path2: {row['path2']}") 197 | print(f" diff={row[metric]}") 198 | print(f" culprit={row['culprit']}") 199 | for k,v in zip(tags, row[tags]): 200 | if v: 201 | r = reason(k, row) 202 | print(f"{k}: ", end="") 203 | print(",".join([f"{x}={y}" for x,y in r.items()])) 204 | print() 205 | input("..Enter to next") 206 | 207 | def json_checkout(group_id, group): 208 | tags = ["a1", "a2", "a3", "a4", "b1", "b2", "b3", "b4", "b5"] 209 | 210 | timestamp = group["timestamp"].values 211 | fmt = "%a %d %b %Y, %I:%M%p" 212 | start_time = datetime.fromtimestamp(timestamp.min()).strftime(fmt) 213 | end_time = datetime.fromtimestamp(timestamp.max()).strftime(fmt) 214 | 215 | events = [] 216 | for prefix_key, ev in group.groupby(["prefix1", "prefix2"]): 217 | route_changes = [] 218 | for _, row in ev.iterrows(): 219 | route_changes.append({ 220 | "timestamp": int(row["timestamp"]), 221 | "path1": str(row["path1"]), 222 | "path2": str(row["path2"]), 223 | "diff": float(row[metric]), 224 | "culprit": json.loads(str(row['culprit'])), 225 | "patterns": {k: reason(k, row) for k,v in zip(tags, row[tags]) if v}, 226 | }) 227 | 228 | events.append({ 229 | "prefix": prefix_key, 230 | "route_changes": route_changes 231 | }) 232 | 233 | ret = { 234 | "group_id": group_id, 235 | "start_time": start_time, 236 | "end_time": end_time, 237 | "events": events, 238 | } 239 | # print(json.dumps(ret, indent=2)) 240 | return ret 241 | 242 | def group_html_checkout(group_id, group): 243 | tags = ["a1", "a2", "a3", "a4", "b1", "b2", "b3", "b4", "b5"] 244 | 245 | timestamp = group["timestamp"].values 246 | fmt = "%Y/%m/%d %H:%M:%S" 247 | start_time = datetime.fromtimestamp(timestamp.min()).strftime(fmt) 248 | end_time = datetime.fromtimestamp(timestamp.max()).strftime(fmt) 249 | 250 | def text_color(s, color): 251 | return f'{s}' 252 | 253 | events = [] 254 | for prefix_key, ev in group.groupby(["prefix1", "prefix2"]): 255 | route_changes = [] 256 | for _, row in ev.iterrows(): 257 | timestamp = f"

timestamp: {row['timestamp']}

" 258 | path1 = f"

path1: {row['path1']}

" 259 | path2 = f"

path2: {row['path2']}

" 260 | diff = f"

diff: {row[metric]}

" 261 | culprit = f"

culprit: {row['culprit']}

" 262 | 263 | patterns = [] 264 | for k,v in zip(tags, row[tags]): 265 | if v: 266 | r = reason(k, row) 267 | p = f"

{k}: "+",".join([f"{x}={y}" for x,y in r.items()])+"

" 268 | patterns.append(p) 269 | pattern_part = "

patterns: " 270 | if not patterns: 271 | pattern_part += "none" 272 | pattern_part += "

" 273 | 274 | rc_html = "
  • \n" 275 | rc_html+= " "+timestamp+"\n" 276 | rc_html+= " "+path1+"\n" 277 | rc_html+= " "+path2+"\n" 278 | rc_html+= " "+diff+"\n" 279 | rc_html+= " "+culprit+"\n" 280 | rc_html+= " "+pattern_part+"\n" 281 | if patterns: rc_html+= " \n" 282 | rc_html+= "
  • " 283 | 284 | route_changes.append(rc_html) 285 | 286 | 287 | c = "MediumSeaGreen" if (ev["pattern"] == "anomaly").any() else "Orange" 288 | p0, p1 = prefix_key 289 | prefix_title = f'

    {text_color(p0,c)} -> {text_color(p1,c)}

    ' 290 | route_change_part = "\n".join(route_changes) 291 | 292 | ev_html = "
  • \n" 293 | ev_html+= " "+prefix_title+"\n" 294 | ev_html+= " \n" 297 | ev_html+= "
  • \n" 298 | 299 | events.append(ev_html) 300 | 301 | mark = text_color("✔", "MediumSeaGreen") \ 302 | if (group["pattern"] == "anomaly").any() else \ 303 | text_color('✗', "Orange") 304 | group_title = f"{mark} id: {group_id}, start: {start_time}, end: {end_time}, events: {len(events)}, route_changes: {group.shape[0]}" 305 | events_part = "".join(events) 306 | 307 | html = f'\n' 308 | html+= '
    \n' 309 | html+= '\n' 312 | html+= '
    \n' 313 | 314 | return html 315 | 316 | def gen_jsonl(): 317 | anomaly_cnt = 0 318 | lines = [] 319 | for group_id, group in df.groupby("group_id"): 320 | if (group["pattern"] == "anomaly").any(): 321 | anomaly_cnt += 1 322 | jl = json_checkout(group_id, group) 323 | lines.append(json.dumps(jl)) 324 | 325 | with open(summary_dir/f"alarms_{collector}_{year}{month:02}.jsonl", "w") as f: 326 | f.write("\n".join(lines)+"\n") 327 | 328 | print(f"total groups: {group_id+1}") 329 | print(f"anomaly: {anomaly_cnt}") 330 | 331 | def terminal_display(): 332 | for group_id, group in df.groupby("group_id"): 333 | terminal_checkout(group_id, group) 334 | 335 | def stats_checkout(df): 336 | daily_cnts = np.zeros(calendar.monthrange(year, int(month))[1], dtype=int) 337 | daily_cnts_a = daily_cnts.copy() 338 | 339 | days = [datetime.fromtimestamp(np.min(g["timestamp"])).day for _,g in df.groupby("group_id")] 340 | days_a = [datetime.fromtimestamp(np.min(g["timestamp"])).day for _,g in df.groupby("group_id") 341 | if (g["pattern"] == "anomaly").any()] 342 | 343 | days, cnts = np.unique(days, return_counts=True) 344 | daily_cnts[days-1] = cnts 345 | 346 | days_a, cnts_a = np.unique(days_a, return_counts=True) 347 | daily_cnts_a[days_a-1] = cnts_a 348 | 349 | route_change_cnts = int(subprocess.run(f"wc -l {route_change_dir}/{year}{month:02}*.csv", shell=True, 350 | stdout=subprocess.PIPE, encoding='UTF-8').stdout.strip().split()[-2]) 351 | 352 | return daily_cnts, daily_cnts_a, route_change_cnts 353 | 354 | def gen_html(): 355 | sections = [] 356 | for group_id, group in df.groupby("group_id"): 357 | html = group_html_checkout(group_id, group) 358 | sections.append(html) 359 | template = open(html_dir/"template_routeviews.html", "r").read() 360 | html = template.replace("REPLACE_WITH_SECTIONS", "\n".join(sections)) 361 | html = html.replace("REPLACE_WITH_TITLE", f"{year}-{month} Report(RouteViews {collector})") 362 | 363 | 364 | daily_cnts, daily_cnts_a, route_change_cnts = stats_checkout(df) 365 | xvalues = "["+", ".join([f"{i+1:02}" for i in range(calendar.monthrange(year, int(month))[1])])+"]" 366 | yvalues_a = "["+", ".join([f"{i+1:02}" for i in daily_cnts_a])+"]" 367 | yvalues_b = "["+", ".join([f"{i+1:02}" for i in daily_cnts-daily_cnts_a])+"]" 368 | 369 | html = html.replace("REPLACE_WITH_XVALUES", xvalues) 370 | html = html.replace("REPLACE_WITH_YVALUES_A", yvalues_a) 371 | html = html.replace("REPLACE_WITH_YVALUES_B", yvalues_b) 372 | 373 | exp = "\n" 376 | 377 | html = html.replace("REPLACE_WITH_EXPLANATION", exp) 378 | 379 | open(html_dir/f"report_{collector}_{year}{month:02}.html", "w").write(html) 380 | 381 | gen_jsonl() 382 | # terminal_display() 383 | gen_html() 384 | 385 | if __name__ == "__main__": 386 | main() 387 | --------------------------------------------------------------------------------