├── doc ├── detection-prototype.png ├── 04-validation-approach.pdf ├── 01-datasets-description.pdf ├── 02-detection-system-design.pdf └── 03-legitimate-route-change-identification.pdf ├── .gitignore ├── data ├── caida_as_rel │ ├── query.py │ └── fetch_data.py ├── caida_as_org │ ├── query.py │ └── fetch_data.py ├── bgpstream │ ├── fetch_data.py │ └── locate_route_change.py └── routeviews │ ├── fetch_rib.py │ └── fetch_updates.py ├── post_processor ├── whois_lookup.py ├── rpki_validator.py ├── irr_validator.py ├── html │ └── template_routeviews.html ├── alarm_postprocess_routeviews.py └── summary_routeviews.py ├── BEAM_engine ├── train.py └── BEAM_model.py ├── routing_monitor ├── detect_route_change_routeviews.py └── monitor.py ├── anomaly_detector ├── BEAM_diff_evaluator_routeviews.py ├── report_anomaly_routeviews.py └── utils.py └── readme.md /doc/detection-prototype.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhchen-tsinghua/routing-anomaly-detection/HEAD/doc/detection-prototype.png -------------------------------------------------------------------------------- /doc/04-validation-approach.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhchen-tsinghua/routing-anomaly-detection/HEAD/doc/04-validation-approach.pdf -------------------------------------------------------------------------------- /doc/01-datasets-description.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhchen-tsinghua/routing-anomaly-detection/HEAD/doc/01-datasets-description.pdf -------------------------------------------------------------------------------- /doc/02-detection-system-design.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhchen-tsinghua/routing-anomaly-detection/HEAD/doc/02-detection-system-design.pdf -------------------------------------------------------------------------------- /doc/03-legitimate-route-change-identification.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhchen-tsinghua/routing-anomaly-detection/HEAD/doc/03-legitimate-route-change-identification.pdf -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | *.vim 3 | 4 | data/routeviews/updates 5 | data/routeviews/ribs 6 | data/routeviews/cache 7 | data/routeviews/bgpd 8 | data/caida_as_rel/serial-1 9 | data/caida_as_rel/serial-2 10 | data/caida_as_org/cache 11 | data/caida_as_org/fetched_data 12 | data/bgpstream/cache 13 | data/bgpstream/event 14 | 15 | __pycache__ 16 | 17 | BEAM_engine/models/ 18 | BEAM_engine/models 19 | 20 | routing_monitor/detection_result/ 21 | routing_monitor/detection_result 22 | 23 | post_processor/rpki_cache/ 24 | post_processor/rpki_cache 25 | -------------------------------------------------------------------------------- /data/caida_as_rel/query.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | #-*- coding: utf-8 -*- 3 | 4 | from pathlib import Path 5 | import click 6 | 7 | SCRIPT_DIR = Path(__file__).resolve().parent 8 | 9 | def load(serial, time): 10 | f = SCRIPT_DIR/f"serial-{serial}"/f"{time}.as-rel{'' if serial == '1' else 2}.txt" 11 | 12 | ngbrs = 
dict() 13 | for line in open(f, "r"): 14 | if line[0] == "#": continue 15 | i, j, k = line.strip().split("|")[:3] 16 | ngbrs.setdefault(i, {-1: set(), 0: set(), 1: set()})[int(k)].add(j) 17 | ngbrs.setdefault(j, {-1: set(), 0: set(), 1: set()})[-int(k)].add(i) 18 | 19 | def query(i, j): 20 | if i not in ngbrs: print(f"Unknown AS: {i}"); return None 21 | if j not in ngbrs: print(f"Unknown AS: {j}"); return None 22 | for k,v in ngbrs[i].items(): 23 | if j in v: return k 24 | return None 25 | 26 | return query 27 | 28 | 29 | @click.command() 30 | @click.option("--serial", "-s", type=click.Choice(["1", "2"]), default="1", help="serial 1 or 2") 31 | @click.option("--time", "-t", type=int, required=True, help="timestamp, e.g., 20200901") 32 | def main(serial, time): 33 | query = load(serial, time) 34 | 35 | while True: 36 | i = input("AS1: ") 37 | j = input("AS2: ") 38 | print(query(i, j)) 39 | 40 | if __name__ == "__main__": 41 | main() 42 | -------------------------------------------------------------------------------- /post_processor/whois_lookup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | #-*- coding: utf-8 -*- 3 | 4 | import subprocess 5 | from datetime import datetime 6 | from pathlib import Path 7 | import re 8 | 9 | script_dir = Path(__file__).resolve().parent 10 | cache_dir = script_dir/"whois_cache" 11 | cache_dir.mkdir(parents=True, exist_ok=True) 12 | 13 | def whois_lookup(target, cache_date=datetime.now().strftime("%Y-%m-%d")): 14 | cache_file = cache_dir/f"{target.replace('/', '_')}.{cache_date}.txt" 15 | if cache_file.exists(): 16 | with cache_file.open("r", encoding="utf-8") as f: 17 | content = f.read() 18 | else: 19 | try: 20 | result = subprocess.run(["whois", target], 21 | text=True, capture_output=True, check=True) 22 | content = result.stdout 23 | with cache_file.open("w", encoding="utf-8") as f: 24 | f.write(content) 25 | except Exception as e: 26 | print(f"Failed to perform WHOIS lookup for {target}: {e}") 27 | content = "" 28 | return content 29 | 30 | def whois_match(prefix_str, asn_str): 31 | whois_content = whois_lookup(prefix_str) 32 | for line in whois_content.split("\n"): 33 | if not line or line.startswith("%"): continue 34 | match = re.match(r"^(\S+):\s+(.*)$", line) 35 | if match: 36 | _, value = match.groups() 37 | for asn_value in re.findall(r"as\d+", value): 38 | if f"as{asn_str}" == asn_value: 39 | return True 40 | return False 41 | -------------------------------------------------------------------------------- /data/caida_as_org/query.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | #-*- coding: utf-8 -*- 3 | 4 | from pathlib import Path 5 | import click 6 | 7 | SCRIPT_DIR = Path(__file__).resolve().parent 8 | 9 | def load(time): 10 | fname = f"{time}.as-org2info.txt" 11 | lines = open(SCRIPT_DIR/"fetched_data"/fname, "r").readlines() 12 | field1 = "aut|changed|aut_name|org_id|opaque_id|source".split("|") 13 | field2 = "org_id|changed|name|country|source".split("|") 14 | as_info = {} 15 | org_info = {} 16 | for l in lines: 17 | if l[0] == "#": continue 18 | values = l.strip().split("|") 19 | if len(values) == len(field1): 20 | if values[0] in as_info and values[1] < as_info[values[0]]["changed"]: continue 21 | as_info[values[0]] = dict(zip(field1[1:], values[1:])) 22 | if len(values) == len(field2): 23 | if values[0] in org_info and values[1] < org_info[values[0]]["changed"]: continue 24 | org_info[values[0]] = 
dict(zip(field2[1:], values[1:])) 25 | return as_info, org_info 26 | 27 | @click.command() 28 | @click.option("--time", "-t", type=int, required=True, help="timestamp, like 20200901") 29 | def main(time): 30 | as_info, org_info = load(time) 31 | while True: 32 | inp = input("ASN or org_id: ") 33 | if inp in as_info: 34 | print(f"asn: {inp}, {as_info[inp]}") 35 | elif inp in org_info: 36 | print(f"org_id: {inp}, {org_info[inp]}") 37 | else: 38 | print("no result") 39 | 40 | if __name__ == "__main__": 41 | main() 42 | -------------------------------------------------------------------------------- /data/caida_as_rel/fetch_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | #-*- coding: utf-8 -*- 3 | 4 | from pathlib import Path 5 | import subprocess 6 | import click 7 | 8 | SCRIPT_DIR = Path(__file__).resolve().parent 9 | 10 | SERIAL_1_DIR = SCRIPT_DIR / "serial-1" 11 | SERIAL_2_DIR = SCRIPT_DIR / "serial-2" 12 | 13 | SERIAL_1_DIR.mkdir(exist_ok=True, parents=True) 14 | SERIAL_2_DIR.mkdir(exist_ok=True, parents=True) 15 | 16 | def get(serial: str, time: int): 17 | if serial == "1": 18 | fname = f"{time}.as-rel.txt.bz2" 19 | obj = f"https://publicdata.caida.org/datasets/as-relationships/serial-1/{fname}" 20 | out = SERIAL_1_DIR / fname 21 | elif serial == "2": 22 | fname = f"{time}.as-rel2.txt.bz2" 23 | obj = f"https://publicdata.caida.org/datasets/as-relationships/serial-2/{fname}" 24 | out = SERIAL_2_DIR / fname 25 | else: 26 | raise RuntimeError("bad argument") 27 | if out.with_suffix("").exists(): 28 | # print(f"as-relationship for {serial} {time} already existed") 29 | return out.with_suffix("") 30 | subprocess.run(["curl", obj, "--output", str(out)], check=True) 31 | subprocess.run(["bzip2", "-d", str(out)], check=True) 32 | print(f"get as-relationship for {serial} {time}") 33 | return out.with_suffix("") 34 | 35 | @click.command() 36 | @click.option("--serial", "-s", type=click.Choice(["1", "2"]), default="1", help="serial 1 or 2") 37 | @click.option("--time", "-t", type=int, required=True, help="timestamp, e.g., 20200901") 38 | def main(serial, time): 39 | get(serial, time) 40 | 41 | if __name__ == "__main__": 42 | main() 43 | -------------------------------------------------------------------------------- /BEAM_engine/train.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | #-*- coding: utf-8 -*- 3 | 4 | from pathlib import Path 5 | from BEAM_model import BEAM 6 | from shutil import get_terminal_size 7 | import click 8 | import os 9 | 10 | import sys 11 | sys.path.append(str(Path(__file__).resolve().parent.parent)) 12 | 13 | from data.caida_as_rel.fetch_data import get as prepare_edge_file 14 | 15 | @click.command() 16 | @click.option("--serial", "-s", type=click.Choice(["1", "2"]), default="1", help="serial 1 or 2") 17 | @click.option("--time", "-t", type=int, required=True, help="timestamp, e.g., 20200901") 18 | @click.option("--Q", "Q", type=int, default=10, help="hyperparameter Q, e.g., 10") 19 | @click.option("--dimension", type=int, default=128, help="hyperparameter dimension size, e.g., 128") 20 | @click.option("--epoches", type=int, default=1000, help="epoches to train, e.g., 1000") 21 | @click.option("--device", type=int, default=0, help="device to train on") 22 | @click.option("--num-workers", type=int, default=1, help="number of workers") 23 | def main(serial, time, device, **model_params): 24 | os.environ["CUDA_VISIBLE_DEVICES"] = 
f"{device}" 25 | 26 | edge_file = prepare_edge_file(serial, time) 27 | assert edge_file.exists(), f"fail to prepare {edge_file}" 28 | 29 | model_params["edge_file"] = edge_file 30 | 31 | for k, v in model_params.items(): 32 | print(f"{k}: {v}") 33 | print("*"*get_terminal_size().columns) 34 | # input("Press Enter to start.") 35 | 36 | train_dir = Path(__file__).resolve().parent/"models"/ \ 37 | f"{edge_file.stem}.{model_params['epoches']}.{model_params['Q']}.{model_params['dimension']}" 38 | train_dir.mkdir(parents=True, exist_ok=True) 39 | model_params["train_dir"] = train_dir 40 | epoches = model_params.pop("epoches") 41 | 42 | model = BEAM(**model_params) 43 | model.train(epoches=epoches) 44 | model.save_embeddings(path=str(train_dir)) 45 | 46 | if __name__ == "__main__": 47 | main() 48 | -------------------------------------------------------------------------------- /data/caida_as_org/fetch_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | #-*- coding: utf-8 -*- 3 | 4 | from pathlib import Path 5 | from urllib.parse import urljoin 6 | import numpy as np 7 | import json 8 | import subprocess 9 | import click 10 | import re 11 | 12 | SCRIPT_DIR = Path(__file__).resolve().parent 13 | CACHE_DIR = SCRIPT_DIR/"cache" 14 | CACHE_DIR.mkdir(parents=True, exist_ok=True) 15 | OUTPUT_DIR = SCRIPT_DIR/"fetched_data" 16 | OUTPUT_DIR.mkdir(parents=True, exist_ok=True) 17 | 18 | def get_archive_list(refresh=False): 19 | cache_path = CACHE_DIR/f"time2url" 20 | if cache_path.exists() and not refresh: 21 | try: return json.load(open(cache_path, "r")) 22 | except: pass 23 | 24 | url_index = "https://publicdata.caida.org/datasets/as-organizations/" 25 | res = subprocess.check_output(["curl", "-s", url_index]).decode() 26 | res = re.sub(r"\s\s+", " ", res.replace("\n", " ")) 27 | time2url = {} 28 | for fname, time in re.findall(r'\', res): 29 | time2url[time] = urljoin(url_index, fname) 30 | 31 | json.dump(time2url, open(cache_path, "w"), indent=2) 32 | return time2url 33 | 34 | def get_most_recent(time): 35 | time2url = get_archive_list() 36 | times = sorted(time2url.keys()) 37 | idx = np.searchsorted(times, time, "right") 38 | 39 | target_time = times[idx-1] 40 | target_url = time2url[target_time] 41 | 42 | out = OUTPUT_DIR/target_url.split("/")[-1] 43 | if out.with_suffix("").exists(): 44 | # print(f"as-organizations for {target_time} exists") 45 | return target_time, out.with_suffix("") 46 | 47 | subprocess.run(["curl", target_url, "--output", str(out)], check=True) 48 | subprocess.run(["gzip", "-d", str(out)], check=True) 49 | print(f"get as-organizations for {target_time}") 50 | return target_time, out.with_suffix("") 51 | 52 | @click.command() 53 | @click.option("--time", "-t", type=str, required=True, help="timestamp, e.g., 20200901") 54 | def main(time): 55 | get_most_recent(time) 56 | 57 | if __name__ == "__main__": 58 | main() 59 | -------------------------------------------------------------------------------- /routing_monitor/detect_route_change_routeviews.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | #-*- coding: utf-8 -*- 3 | 4 | from pathlib import Path 5 | import pandas as pd 6 | from datetime import datetime, timedelta 7 | import pickle 8 | import click 9 | 10 | import sys 11 | sys.path.append(str(Path(__file__).resolve().parent.parent)) 12 | 13 | from data.routeviews.fetch_updates import load_updates_to_df, get_all_collectors, get_archive_list, 
download_data 14 | from monitor import Monitor 15 | 16 | SCRIPT_DIR = Path(__file__).resolve().parent 17 | 18 | def detect(data, route_change_dir, snapshot_dir): 19 | mon = Monitor() 20 | 21 | for fpath in data: 22 | _, date, time = fpath.name.split(".") 23 | 24 | df = load_updates_to_df(fpath) 25 | df = df.sort_values(by="timestamp") 26 | 27 | mon.consume(df, detect=True) 28 | 29 | route_change_df = pd.DataFrame.from_records(mon.route_changes) 30 | mon.route_changes = [] 31 | 32 | route_change_df.to_csv(route_change_dir/f"{date}.{time}.csv", index=False) 33 | 34 | if time == "2345": 35 | pickle.dump(mon, open(snapshot_dir/f"{date}.end-of-the-day", "wb")) 36 | 37 | @click.command() 38 | @click.option("--collector", "-c", type=str, default="wide", help="the name of RouteView collector to use") 39 | @click.option("--year", "-y", type=int, required=True, help="the year to monitor, e.g., 2024") 40 | @click.option("--month", "-m", type=int, required=True, help="the month to monitor, e.g., 8") 41 | def detect_monthly_for(collector, year, month): 42 | result_dir = SCRIPT_DIR/"detection_result"/collector 43 | route_change_dir = result_dir/"route_change" 44 | snapshot_dir = result_dir/"snapshot" 45 | 46 | route_change_dir.mkdir(exist_ok=True, parents=True) 47 | snapshot_dir.mkdir(exist_ok=True, parents=True) 48 | 49 | collectors2url = get_all_collectors() 50 | 51 | d1 = datetime(year=year, month=month, day=1) 52 | d2 = (datetime(year=year, month=month, day=28) + timedelta(days=4) 53 | ).replace(day=1) - timedelta(minutes=15) 54 | 55 | monthly_data = list(map(lambda url: download_data(url, collector), 56 | get_archive_list(collector, collectors2url, d1, d2))) 57 | 58 | detect(monthly_data, route_change_dir, snapshot_dir) 59 | 60 | if __name__ == "__main__": 61 | detect_monthly_for() 62 | -------------------------------------------------------------------------------- /anomaly_detector/BEAM_diff_evaluator_routeviews.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | #-*- coding: utf-8 -*- 3 | 4 | from functools import lru_cache 5 | from pathlib import Path 6 | import pandas as pd 7 | import click 8 | 9 | from utils import load_emb_distance 10 | 11 | repo_dir = Path(__file__).resolve().parent.parent 12 | model_dir = repo_dir/"BEAM_engine"/"models" 13 | 14 | @click.command() 15 | @click.option("--collector", "-c", type=str, default="wide", help="the name of RouteView collector that the route changes to evaluate are from") 16 | @click.option("--year", "-y", type=int, required=True, help="the year of the route changes monitored, e.g., 2024") 17 | @click.option("--month", "-m", type=int, required=True, help="the month of the route changes monitored, e.g., 8") 18 | @click.option("--beam-model", "-b", type=str, required=True, help="the trained BEAM model to use, e.g., 20240801.as-rel2.1000.10.128") 19 | def evaluate_monthly_for(collector, year, month, beam_model): 20 | collector_result_dir = repo_dir/"routing_monitor"/"detection_result"/collector 21 | route_change_dir = collector_result_dir/"route_change" 22 | beam_metric_dir = collector_result_dir/"BEAM_metric" 23 | beam_metric_dir.mkdir(exist_ok=True, parents=True) 24 | 25 | emb_dir = model_dir/beam_model 26 | emb_d, dtw_d, path_d, emb, _, _ = load_emb_distance(emb_dir, return_emb=True) 27 | 28 | def dtw_d_only_exist(s, t): 29 | return dtw_d([i for i in s if i in emb], [i for i in t if i in emb]) 30 | 31 | for i in route_change_dir.glob(f"{year}{month:02d}*.csv"): 32 | beam_metric_file = 
beam_metric_dir/f"{i.stem}.bm.csv" 33 | if beam_metric_file.exists(): continue 34 | 35 | df = pd.read_csv(i) 36 | 37 | path1 = [s.split(" ") for s in df["path1"].values] 38 | path2 = [t.split(" ") for t in df["path2"].values] 39 | 40 | metrics = pd.DataFrame.from_dict({ 41 | "diff": [dtw_d(s,t) for s,t in zip(path1, path2)], 42 | "diff_only_exist": [dtw_d_only_exist(s,t) for s,t in zip(path1, path2)], 43 | "path_d1": [path_d(i) for i in path1], 44 | "path_d2": [path_d(i) for i in path2], 45 | "path_l1": [len(i) for i in path1], 46 | "path_l2": [len(i) for i in path2], 47 | "head_tail_d1": [emb_d(i[0], i[-1]) for i in path1], 48 | "head_tail_d2": [emb_d(i[0], i[-1]) for i in path2], 49 | }) 50 | 51 | metrics.to_csv(beam_metric_file, index=False) 52 | 53 | if __name__ == "__main__": 54 | evaluate_monthly_for() 55 | -------------------------------------------------------------------------------- /routing_monitor/monitor.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import ipaddress 3 | 4 | class Monitor: 5 | class Node: 6 | def __init__(self): 7 | self.routes = dict() # forwarder -> aspath 8 | self.left = None 9 | self.right = None 10 | 11 | def get_left(self): 12 | if self.left is None: 13 | self.left = Monitor.Node() 14 | return self.left 15 | 16 | def get_right(self): 17 | if self.right is None: 18 | self.right = Monitor.Node() 19 | return self.right 20 | 21 | def find_route(self, forwarder): 22 | if forwarder in self.routes: 23 | return self.routes[forwarder] 24 | return None 25 | 26 | def __init__(self): 27 | self.root = Monitor.Node() 28 | self.route_changes = [] 29 | 30 | def update(self, timestamp, prefix_str, vantage_point, aspath_str, detect): 31 | prefix = ipaddress.ip_network(prefix_str) 32 | 33 | if prefix.version == 6: return 34 | prefixlen = prefix.prefixlen 35 | prefix = int(prefix[0]) >> (32-prefixlen) 36 | 37 | aspath = aspath_str.split(" ") 38 | forwarder = aspath[0] # NOTE: forwarder could be vantage point, or could not 39 | 40 | n = self.root 41 | original_route = None 42 | for shift in range(prefixlen-1, -1, -1): # find the original route 43 | left = (prefix >> shift) & 1 44 | 45 | if left: n = n.get_left() 46 | else: n = n.get_right() 47 | 48 | if n.find_route(forwarder) is not None: 49 | original_route = [shift, n.find_route(forwarder)] 50 | 51 | if detect and original_route is not None: 52 | shift, original_path = original_route 53 | vict_prefix = ipaddress.ip_network(prefix_str) \ 54 | .supernet(new_prefix=prefixlen-shift) 55 | if aspath != original_path: 56 | self.route_changes.append({ 57 | "timestamp" : timestamp, 58 | "vantage_point": vantage_point, 59 | "forwarder" : forwarder, 60 | "prefix1" : str(vict_prefix), 61 | "prefix2" : prefix_str, 62 | "path1" : " ".join(original_path), 63 | "path2" : " ".join(aspath), 64 | }) 65 | 66 | n.routes[forwarder] = aspath 67 | 68 | def consume(self, df, detect=False): 69 | if "A/W" in df.columns: 70 | df = df.loc[df["A/W"] == "A"] # NOTE: fair move 71 | cols = ["timestamp", "prefix", "peer-asn", "as-path"] 72 | 73 | for a in df[cols].values: 74 | self.update(*a, detect=detect) 75 | -------------------------------------------------------------------------------- /post_processor/rpki_validator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | #-*- coding: utf-8 -*- 3 | import requests 4 | import ipaddress 5 | import lzma 6 | from pathlib import Path 7 | import pandas as pd 8 | 9 | script_dir = 
Path(__file__).resolve().parent 10 | cache_dir = script_dir/"rpki_cache" 11 | cache_dir.mkdir(parents=True, exist_ok=True) 12 | 13 | def fetch_and_uncompress_xz(url, output_path): 14 | if output_path.exists(): 15 | return output_path 16 | try: 17 | temp_xz_file = output_path.with_suffix(output_path.suffix + ".xz") 18 | 19 | response = requests.get(url, stream=True) 20 | response.raise_for_status() 21 | 22 | with temp_xz_file.open("wb") as file: 23 | for chunk in response.iter_content(chunk_size=8192): 24 | file.write(chunk) 25 | 26 | with lzma.open(temp_xz_file, "rb") as xz_file: 27 | with output_path.open("wb") as out_file: 28 | out_file.write(xz_file.read()) 29 | 30 | temp_xz_file.unlink() 31 | 32 | return output_path 33 | 34 | except requests.RequestException as e: 35 | print(f"Error fetching file from {url}: {e}") 36 | except lzma.LZMAError as e: 37 | print(f"Error decompressing the .xz file: {e}") 38 | except Exception as e: 39 | print(f"An unexpected error occurred: {e}") 40 | 41 | # check this out if the current one is down: http://josephine.sobornost.net/ 42 | def sync_cache(year, month, day, source="https://ftp.ripe.net/rpki"): 43 | dfs = [] 44 | for rir in ["apnic", "afrinic", "arin", "lacnic", "ripencc"]: 45 | url = f"{source}/{rir}.tal/{year}/{month:02d}/{day:02d}/roas.csv.xz" 46 | output_path = cache_dir/f"roas-{rir}-{year}{month:02d}{day:02d}.csv" 47 | df = pd.read_csv(fetch_and_uncompress_xz(url, output_path)) 48 | df["TA"] = rir 49 | dfs.append(df) 50 | return pd.concat(dfs, ignore_index=True) 51 | 52 | class RPKI: 53 | class PrefixNode: 54 | def __init__(self): 55 | self.left = None 56 | self.right = None 57 | self.data = [] 58 | 59 | def get_left(self): 60 | if self.left is None: 61 | self.left = RPKI.PrefixNode() 62 | return self.left 63 | 64 | def get_right(self): 65 | if self.right is None: 66 | self.right = RPKI.PrefixNode() 67 | return self.right 68 | 69 | def update_data(self, **kwargs): 70 | self.data.append(kwargs) 71 | 72 | def __init__(self): 73 | self.root = RPKI.PrefixNode() 74 | 75 | def load_data(self, year, month, day): 76 | df = sync_cache(year, month, day) 77 | for _, row in df.iterrows(): 78 | if row["IP Prefix"][-2:] == "/0": continue 79 | directions = self.prefix_to_dirs(row["IP Prefix"]) 80 | if not directions: continue 81 | self.create_node(directions).update_data(**row.to_dict()) 82 | return self 83 | 84 | @staticmethod 85 | def prefix_to_dirs(prefix_str): 86 | prefix = ipaddress.ip_network(prefix_str) 87 | if prefix.version == 6: return None 88 | prefixlen = prefix.prefixlen 89 | prefix = int(prefix[0]) >> (32-prefixlen) 90 | directions = [(prefix>>shift)&1 91 | for shift in range(prefixlen-1, -1, -1)] 92 | return directions 93 | 94 | def create_node(self, directions): 95 | n = self.root 96 | for left in directions: 97 | if left: n = n.get_left() 98 | else: n = n.get_right() 99 | return n 100 | 101 | def match_node(self, directions): 102 | matched = [] 103 | n = self.root 104 | for left in directions: 105 | if left: n = n.get_left() 106 | else: n = n.get_right() 107 | if n is None: break 108 | if n.data: matched += n.data 109 | return matched 110 | 111 | def validate(self, prefix_str, asn_str): 112 | directions = self.prefix_to_dirs(prefix_str) 113 | 114 | if not directions: return "Not Found" 115 | 116 | matched = self.match_node(directions) 117 | 118 | if not matched: return "Not Found" 119 | 120 | for roa in matched: 121 | if int(prefix_str.split("/")[-1]) <= int(roa["Max Length"]) \ 122 | and f"AS{asn_str}" == roa["ASN"]: 123 | return "Valid" 
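# Covering ROAs exist at this point, but none authorizes this origin ASN within its
# MaxLength, so the announcement is treated as RPKI-invalid; "Not Found" above is
# returned only when no covering ROA exists at all (cf. RFC 6811 origin validation).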
124 | 125 | return "Invalid" 126 | 127 | def all_matched(self, prefix_str): 128 | directions = self.prefix_to_dirs(prefix_str) 129 | 130 | if not directions: return [] 131 | 132 | matched = self.match_node(directions) 133 | 134 | return matched 135 | -------------------------------------------------------------------------------- /data/bgpstream/fetch_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | #-*- coding: utf-8 -*- 3 | 4 | from pathlib import Path 5 | import numpy as np 6 | from urllib.parse import urljoin 7 | from concurrent.futures import ThreadPoolExecutor 8 | import subprocess 9 | import json 10 | import re 11 | 12 | SCRIPT_DIR = Path(__file__).resolve().parent 13 | CACHE_DIR = SCRIPT_DIR/"cache" 14 | CACHE_DIR.mkdir(parents=True, exist_ok=True) 15 | 16 | url_index="https://bgpstream.crosswork.cisco.com/" 17 | 18 | def get_page(url): 19 | page = subprocess.check_output(["curl", "-s", url]).decode() 20 | return page 21 | 22 | def item_parser(index_page): 23 | events = [] 24 | for item_str in re.finditer(r'\.+?\', index_page, flags=re.DOTALL): 25 | try: 26 | item_str = item_str[0] 27 | item = dict() 28 | for k, v in re.findall(r'\(.+?)\', 29 | item_str, flags=re.DOTALL): 30 | v = re.sub(r"\s\s+", " ", v.replace("\n", " ")).strip() 31 | 32 | if k == "asn": 33 | asns = re.findall(r'\(AS (\d+?)\)', v, flags=re.DOTALL) 34 | 35 | if item["event_type"] == "Outage": 36 | item["asn"] = asns 37 | elif item["event_type"] == "Possible Hijack": 38 | expected, detected = asns 39 | item["expected_asn"] = expected 40 | item["detected_asn"] = detected 41 | elif item["event_type"] == "BGP Leak": 42 | origin, leaker = asns 43 | item["origin_asn"] = origin 44 | item["leaker_asn"] = leaker 45 | else: 46 | raise RuntimeError( 47 | f"Uncovered event_type: {item['event_type']}") 48 | elif k == "country" and v: 49 | item["country"] = v.split(" ")[0] 50 | elif k == "moredetail": 51 | v = re.search(r'\', v) 52 | if v is None: 53 | item["moredetail"] = "" 54 | item["event_id"] = "" 55 | else: 56 | item["moredetail"] = v[1] 57 | item["event_id"] = v[2] 58 | else: 59 | item[k] = v 60 | events.append(item) 61 | except Exception as e: 62 | print(e) 63 | continue 64 | ids = np.array([int(i["event_id"]) for i in events]) 65 | sort_idx = np.argsort(ids) 66 | events = np.array(events)[sort_idx].tolist() 67 | ids = ids[sort_idx].tolist() 68 | return events, ids 69 | 70 | def update_cache(): 71 | index_page = get_page(url_index) 72 | events, ids = item_parser(index_page) 73 | 74 | current_id = [int(i.stem) for i in CACHE_DIR.glob("*.jsonl")] 75 | current_max_id = max(current_id) if current_id else -1 76 | start_idx = np.searchsorted(ids, current_max_id, "right") 77 | 78 | events = events[start_idx:] 79 | if not events: 80 | print("No need to update.") 81 | return 82 | 83 | def fetch_for_detail(ev): 84 | if ev["event_type"] == "Possible Hijack": 85 | detail_page = get_page(urljoin(url_index, ev["moredetail"])) 86 | pattern = r'Expected prefix: (.+?/\d{1,2})' 87 | expected = re.search(pattern, detail_page) 88 | if expected is not None: 89 | ev["expected_prefix"] = expected[1] 90 | else: 91 | print(f"unknown expected_prefix: {ev}") 92 | 93 | pattern = r'Detected advertisement: (.+?/\d{1,2})' 94 | detected = re.search(pattern, detail_page) 95 | if detected is not None: 96 | ev["detected_prefix"] = detected[1] 97 | else: 98 | print(f"unknown detected_prefix: {ev}") 99 | 100 | elif ev["event_type"] == "BGP Leak": 101 | detail_page = 
get_page(urljoin(url_index, ev["moredetail"])) 102 | pattern = r'Leaked prefix: (.+?/\d{1,2})' 103 | leaked = re.search(pattern, detail_page) 104 | if leaked is not None: 105 | ev["leaked_prefix"] = leaked[1] 106 | else: 107 | print(f"unknown leaked_prefix: {ev}") 108 | 109 | pattern = r'Leaked To:\\s+\(\d+)' 110 | leakedto = re.search(pattern, detail_page) 111 | if leakedto is not None: 112 | ev["leaked_to"] = leakedto[1] 113 | else: 114 | print(f"unknown leaked_to: {ev}") 115 | 116 | with ThreadPoolExecutor(max_workers=128) as executor: 117 | executor.map(fetch_for_detail, events) 118 | 119 | f = open(CACHE_DIR/f"{ids[-1]}.jsonl", "w") 120 | f.write("\n".join([json.dumps(ev) for ev in events])+"\n") 121 | f.close() 122 | 123 | print(f"Update {len(events)} items") 124 | print(f"Latest event_id: {ids[-1]}") 125 | 126 | update_cache() 127 | -------------------------------------------------------------------------------- /anomaly_detector/report_anomaly_routeviews.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | #-*- coding: utf-8 -*- 3 | 4 | from pathlib import Path 5 | from joblib import Parallel, delayed 6 | from concurrent.futures import ThreadPoolExecutor 7 | from utils import approx_knee_point, event_aggregate 8 | import json 9 | import pandas as pd 10 | import numpy as np 11 | import click 12 | 13 | repo_dir = Path(__file__).resolve().parent.parent 14 | route_change_dir = None 15 | beam_metric_dir = None 16 | 17 | def load_monthly_data(year, month, preprocessor=lambda df: df): 18 | route_change_files = sorted(route_change_dir.glob(f"{year}{month:02d}*.csv")) 19 | beam_metric_files = sorted(beam_metric_dir.glob(f"{year}{month:02d}*.bm.csv")) 20 | datetimes = [i.stem.replace(".","")[:-2] for i in route_change_files] 21 | 22 | bulk_datetimes, bulk_indices = np.unique(datetimes, return_index=True) 23 | bulk_ranges = zip(bulk_indices, bulk_indices[1:].tolist()+[len(datetimes)]) 24 | 25 | def load_one_bulk(i,j): 26 | rc_df = pd.concat(list(map(pd.read_csv, route_change_files[i:j]))) 27 | bm_df = pd.concat(list(map(pd.read_csv, beam_metric_files[i:j]))) 28 | return pd.concat([rc_df, bm_df], axis=1) 29 | 30 | with ThreadPoolExecutor(max_workers=4) as executor: 31 | bulks = list(executor.map( 32 | lambda x: preprocessor(load_one_bulk(*x)), bulk_ranges)) 33 | 34 | return bulk_datetimes, bulks 35 | 36 | def metric_threshold(df, metric_col): 37 | values = df[metric_col] 38 | mu = np.mean(values) 39 | sigma = np.std(values) 40 | metric_th = mu+4*sigma 41 | 42 | print("reference metric: ") 43 | print(values.describe()) 44 | print(f"metric threshold: {metric_th}") 45 | 46 | return metric_th 47 | 48 | def forwarder_threshold(df, event_key): 49 | route_changes = tuple(df.groupby(event_key)) 50 | forwarder_num = [len(j["forwarder"].unique()) for _, j in route_changes] 51 | forwarder_th, cdf = approx_knee_point(forwarder_num) 52 | 53 | print("reference forwarder: ") 54 | print(pd.Series(forwarder_num).describe()) 55 | print(f"forwarder threshold: {forwarder_th}") 56 | 57 | return forwarder_th 58 | 59 | def window(df0, df1, # df0 for reference, df1 for detection 60 | metric="diff", event_key=["prefix1", "prefix2"], 61 | dedup_index=["prefix1", "prefix2", "forwarder", "path1", "path2"]): 62 | 63 | if dedup_index is not None: 64 | df0 = df0.drop_duplicates(dedup_index, keep="first", inplace=False, ignore_index=True) 65 | 66 | with pd.option_context("mode.use_inf_as_na", True): 67 | df0 = df0.dropna(how="any") 68 | 69 | metric_th = 
metric_threshold(df0, metric) 70 | forwarder_th = forwarder_threshold(df0, event_key) 71 | 72 | events = {} 73 | for key,ev in tuple(df1.groupby(event_key)): 74 | if len(ev["forwarder"].unique()) <= forwarder_th: continue 75 | 76 | ev_sig = ev.sort_values(metric, ascending=False).drop_duplicates("forwarder") 77 | ev_anomaly = ev_sig.loc[ev_sig[metric]>metric_th] 78 | if ev_anomaly.shape[0] <= forwarder_th: continue 79 | 80 | events[key] = ev_anomaly 81 | 82 | if events: 83 | _, df = event_aggregate(events) 84 | n_alarms = len(df['group_id'].unique()) 85 | else: 86 | df = None 87 | n_alarms = 0 88 | 89 | info = dict( 90 | metric=metric, 91 | event_key=event_key, 92 | metric_th=float(metric_th), 93 | forwarder_th=int(forwarder_th), 94 | n_raw_events=len(events), 95 | n_alarms=n_alarms, 96 | ) 97 | 98 | return info, df 99 | 100 | @click.command() 101 | @click.option("--collector", "-c", type=str, default="wide", help="the name of RouteView collector to detect anomalies") 102 | @click.option("--year", "-y", type=int, required=True, help="the year of the route changes monitored, e.g., 2024") 103 | @click.option("--month", "-m", type=int, required=True, help="the month of the route changes monitored, e.g., 8") 104 | def report_alarm_monthly(collector, year, month): 105 | global route_change_dir, beam_metric_dir 106 | collector_result_dir = repo_dir/"routing_monitor"/"detection_result"/collector 107 | route_change_dir = collector_result_dir/"route_change" 108 | beam_metric_dir = collector_result_dir/"BEAM_metric" 109 | reported_alarm_dir = collector_result_dir/"reported_alarms"/f"{year}{month:02d}" 110 | reported_alarm_dir.mkdir(parents=True, exist_ok=True) 111 | 112 | def preprocessor(df): 113 | df["diff_balance"] = df["diff"]/(df["path_d1"]+df["path_d2"]) 114 | return df 115 | 116 | datetimes, bulks = load_monthly_data(year, month, preprocessor) 117 | indices = np.arange(len(bulks)) 118 | infos = [] 119 | 120 | for i, j in list(zip(indices[:-1], indices[1:])): 121 | info = dict(d0=datetimes[i], d1=datetimes[j]) 122 | _info, df = window(bulks[i], bulks[j], metric="diff_balance") 123 | info.update(**_info) 124 | 125 | if df is None: 126 | info.update(save_path=None) 127 | else: 128 | save_path = reported_alarm_dir/f"{datetimes[i]}_{datetimes[j]}.alarms.csv" 129 | df.to_csv(save_path, index=False) 130 | info.update(save_path=str(save_path)) 131 | 132 | infos.append(info) 133 | 134 | json.dump(infos, open(reported_alarm_dir/f"info_{year}{month:02d}.json", "w"), indent=2) 135 | 136 | if __name__ == "__main__": 137 | report_alarm_monthly() 138 | -------------------------------------------------------------------------------- /post_processor/irr_validator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | #-*- coding: utf-8 -*- 3 | 4 | import ftplib 5 | import shutil 6 | from pathlib import Path 7 | import gzip 8 | import ipaddress 9 | import re 10 | 11 | script_dir = Path(__file__).resolve().parent 12 | cache_dir = script_dir/"irr_cache" 13 | cache_dir.mkdir(parents=True, exist_ok=True) 14 | 15 | def fetch_and_uncompress_gz(year, month, day, domain="ftp.radb.net"): 16 | output_path = cache_dir/f"radb-{year}{month:02d}{day:02d}.db" 17 | 18 | if output_path.exists(): 19 | return output_path 20 | 21 | temp_gz_file = output_path.with_suffix(output_path.suffix + ".gz") 22 | 23 | ftp = ftplib.FTP(domain) 24 | ftp.login() 25 | 26 | remote_paths = [ 27 | f"/radb/dbase/archive/{year}/radb.db.{year}{month:02d}{day:02d}.gz", 28 | 
f"/radb/dbase/archive/{year}/radb.db.{str(year)[-2:]}{month:02d}{day:02d}.gz" 29 | ] 30 | 31 | for remote in remote_paths: 32 | try: 33 | with temp_gz_file.open("wb") as file: 34 | ftp.retrbinary(f"RETR {remote}", file.write) 35 | break 36 | except ftplib.error_perm as e: 37 | print(f"Remote file not found at {remote}. Trying the next path...") 38 | except Exception as e: 39 | print(f"Unexpected error while accessing {remote}: {e}") 40 | else: 41 | raise FileNotFoundError("Remote files unavailable.") 42 | 43 | with gzip.open(temp_gz_file, "rb") as f_in, output_path.open("wb") as f_out: 44 | shutil.copyfileobj(f_in, f_out) 45 | 46 | temp_gz_file.unlink() 47 | 48 | return output_path 49 | 50 | def parse_route_blocks(file_path): 51 | with open(file_path, encoding='ISO-8859-1') as file: 52 | content = file.read() 53 | 54 | blocks = content.strip().split('\n\n') 55 | 56 | route_blocks = [] 57 | for block in blocks: 58 | if block.startswith("route:"): 59 | route_dict = parse_route_block(block) 60 | assert route_dict 61 | route_dict["original_data"] = block 62 | route_blocks.append(route_dict) 63 | 64 | return route_blocks 65 | 66 | def parse_route_block(block): 67 | block_dict = {} 68 | current_key = None 69 | 70 | for line in block.split('\n'): 71 | if not line: continue 72 | match = re.match(r'^(\S+):(.*)$', line) 73 | if match: 74 | current_key, value = match.groups() 75 | else: 76 | value = line # multi-line value 77 | assert current_key is not None 78 | block_dict.setdefault(current_key, []).append(value.strip()) 79 | for k, v in block_dict.items(): 80 | block_dict[k] = "\n".join(v) 81 | 82 | return block_dict 83 | 84 | def sync_cache(year, month, day): 85 | route_objects = parse_route_blocks(fetch_and_uncompress_gz(year, month, day)) 86 | return route_objects 87 | 88 | class RADB: 89 | class PrefixNode: 90 | def __init__(self): 91 | self.left = None 92 | self.right = None 93 | self.data = [] 94 | 95 | def get_left(self): 96 | if self.left is None: 97 | self.left = RADB.PrefixNode() 98 | return self.left 99 | 100 | def get_right(self): 101 | if self.right is None: 102 | self.right = RADB.PrefixNode() 103 | return self.right 104 | 105 | def update_data(self, **kwargs): 106 | self.data.append(kwargs) 107 | 108 | def __init__(self): 109 | self.root = RADB.PrefixNode() 110 | 111 | def load_data(self, year, month, day): 112 | route_objects = sync_cache(year, month, day) 113 | for obj in route_objects: 114 | if obj["route"][-2:] == "/0": continue 115 | try: 116 | directions = self.prefix_to_dirs(obj["route"]) 117 | except: 118 | print(obj) 119 | exit() 120 | if not directions: continue 121 | self.create_node(directions).update_data(**obj) 122 | return self 123 | 124 | @staticmethod 125 | def prefix_to_dirs(prefix_str): 126 | prefix = ipaddress.ip_network(prefix_str) 127 | if prefix.version == 6: return None 128 | prefixlen = prefix.prefixlen 129 | prefix = int(prefix[0]) >> (32-prefixlen) 130 | directions = [(prefix>>shift)&1 131 | for shift in range(prefixlen-1, -1, -1)] 132 | return directions 133 | 134 | def create_node(self, directions): 135 | n = self.root 136 | for left in directions: 137 | if left: n = n.get_left() 138 | else: n = n.get_right() 139 | return n 140 | 141 | def match_node(self, directions): 142 | matched = None 143 | n = self.root 144 | for left in directions: 145 | if left: n = n.get_left() 146 | else: n = n.get_right() 147 | if n is None: break 148 | if n.data: matched = n 149 | return matched 150 | 151 | def validate(self, prefix_str, asn_str): 152 | directions = 
self.prefix_to_dirs(prefix_str) 153 | 154 | if not directions: return "Not Found" 155 | 156 | matched = self.match_node(directions) # longest match 157 | 158 | if matched is None: return "Not Found" 159 | 160 | irrs = matched.data 161 | 162 | for irr in irrs: 163 | if f"AS{asn_str}" == irr["origin"]: 164 | return "Valid" 165 | 166 | return "Invalid" 167 | 168 | def all_matched(self, prefix_str): 169 | directions = self.prefix_to_dirs(prefix_str) 170 | 171 | if not directions: return [] 172 | 173 | matched = self.match_node(directions) # longest match 174 | 175 | if matched is None: return [] 176 | 177 | return matched.data 178 | -------------------------------------------------------------------------------- /data/routeviews/fetch_rib.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | #-*- coding: utf-8 -*- 3 | 4 | from pathlib import Path 5 | from io import StringIO 6 | from urllib.parse import urljoin 7 | import pandas as pd 8 | import numpy as np 9 | import subprocess 10 | import re 11 | import json 12 | from datetime import datetime 13 | from dateutil.relativedelta import relativedelta 14 | 15 | SCRIPT_DIR = Path(__file__).resolve().parent 16 | CACHE_DIR = SCRIPT_DIR/"cache" 17 | CACHE_DIR.mkdir(parents=True, exist_ok=True) 18 | 19 | current_ym = datetime.now().strftime("%Y.%m") 20 | for cache_file in CACHE_DIR.glob(f"*{current_ym}*"): # remove incomplete cache files 21 | cache_file.unlink() 22 | 23 | def get_all_collectors(url_index="http://routeviews.org/"): 24 | cache_path = CACHE_DIR/f"collectors2url.{url_index.replace('/', '+')}" 25 | if cache_path.exists(): 26 | # print(f"load cache: {cache_path}") 27 | try: return json.load(open(cache_path, "r")) 28 | except: pass 29 | 30 | res = subprocess.check_output(["curl", "-s", url_index]).decode() 31 | res = re.sub(r"\s\s+", " ", res.replace("\n", " ")) 32 | collectors2url = {} 33 | for a, b in re.findall(r'\.+?\([\w\s]+, from (.+?)\)', res): 34 | collector_name = b.split(".")[-3] 35 | if collector_name in collectors2url: 36 | idx = 2 37 | while f"{collector_name}{idx}" in collectors2url: 38 | idx += 1 39 | collector_name = f"{collector_name}{idx}" 40 | collectors2url[collector_name] = urljoin(url_index, a) + "/" 41 | 42 | # print(f"save cache: {cache_path}") 43 | json.dump(collectors2url, open(cache_path, "w"), indent=2) 44 | return collectors2url 45 | 46 | def get_most_recent_rib(collector, collectors2url, dtime): 47 | if collector not in collectors2url: return [] 48 | 49 | def pull_list(): 50 | target_url = urljoin(collectors2url[collector], f"{ym}{subdir}") + "/" 51 | cache_path = CACHE_DIR/f"archive_list.{target_url.replace('/', '+')}" 52 | if cache_path.exists(): 53 | # print(f"load cache: {cache_path}") 54 | try: return target_url, json.load(open(cache_path, "r")) 55 | except: pass 56 | res = subprocess.check_output(["curl", "-s", target_url]).decode() 57 | archive_list = re.findall( 58 | r'\', res) 59 | # print(f"save cache: {cache_path}") 60 | json.dump(archive_list, open(cache_path, "w"), indent=2) 61 | return target_url, archive_list 62 | 63 | ym = dtime.strftime("%Y.%m") 64 | subdir = "/RIBS" 65 | target_url, archive_list = pull_list() 66 | 67 | if not archive_list: 68 | subdir = "" 69 | target_url, archive_list = pull_list() 70 | if not archive_list: return [] 71 | 72 | time_list = ["".join(i[1:]) for i in archive_list] 73 | t = dtime.strftime("%Y%m%d%H%M") 74 | idx = np.searchsorted(time_list, t) 75 | 76 | if idx == 0: 77 | data1 = urljoin(target_url, 
archive_list[0][0]) 78 | dtime = dtime-relativedelta(months=1) 79 | ym = dtime.strftime("%Y.%m") 80 | target_url, archive_list = pull_list() 81 | if not archive_list: return [] 82 | data0 = urljoin(target_url, archive_list[-1][0]) 83 | stime = datetime.strptime("".join(archive_list[-1][1:]), "%Y%m%d%H%M") 84 | return data0, data1, stime 85 | 86 | if idx == len(time_list): 87 | data0 = urljoin(target_url, archive_list[-1][0]) 88 | stime = datetime.strptime("".join(archive_list[-1][1:]), "%Y%m%d%H%M") 89 | dtime = dtime+relativedelta(months=1) 90 | ym = dtime.strftime("%Y.%m") 91 | target_url, archive_list = pull_list() 92 | if not archive_list: return [] 93 | data1 = urljoin(target_url, archive_list[0][0]) 94 | return data0, data1, stime 95 | 96 | data0 = urljoin(target_url, archive_list[idx-1][0]) 97 | data1 = urljoin(target_url, archive_list[idx][0]) 98 | stime = datetime.strptime("".join(archive_list[idx-1][1:]), "%Y%m%d%H%M") 99 | return data0, data1, stime 100 | 101 | def download_data(url, collector): 102 | fname = url.split("/")[-1].strip() 103 | outpath = SCRIPT_DIR / "ribs" / collector / fname 104 | fpath = outpath.with_suffix("") 105 | if fpath.exists(): 106 | # print(f"updates for {collector} {outpath.stem} already existed") 107 | return fpath 108 | outpath.parent.mkdir(exist_ok=True, parents=True) 109 | subprocess.run(["curl", "-s", url, "--output", str(outpath)], check=True) 110 | subprocess.run(["bzip2", "-d", str(outpath)], check=True) 111 | print(f"get ribs for {collector} {outpath.stem}") 112 | return fpath 113 | 114 | def load_ribs_to_df(fpath): 115 | if fpath.suffix == ".dat": 116 | fd = open(fpath, "r") 117 | l = fd.readline() 118 | while l: 119 | if "Network" in l and "Path" in l: 120 | idx_network = l.find("Network") 121 | idx_path = l.find("Path") 122 | break 123 | l = fd.readline() 124 | 125 | data = [] 126 | current_network = "" 127 | while l: 128 | if l[0] != "*": l = fd.readline(); continue 129 | if l[idx_network] != " ": 130 | current_network = l[idx_network:idx_network+l[idx_network:].find(" ")] 131 | if l[1] == ">": 132 | path = l[idx_path:-3] 133 | if "/" in current_network: 134 | data.append(["0", current_network, path.split(" ")[0], path]) 135 | l = fd.readline() 136 | df = pd.DataFrame(data, columns=["timestamp", "prefix", "peer-asn", "as-path"]) 137 | else: 138 | bgpd = SCRIPT_DIR / 'bgpd' 139 | res = subprocess.check_output([str(bgpd), "-q", "-m", "-u", str(fpath)]).decode() 140 | fmt = "type|timestamp|A/W|peer-ip|peer-asn|prefix|as-path|origin-protocol|next-hop|local-pref|MED|community|atomic-agg|aggregator|unknown-field-1|unknown-field-2" 141 | cols = fmt.split("|") 142 | df = pd.read_csv(StringIO(res), sep="|", names=cols, usecols=cols[:-2], dtype=str, keep_default_na=False) 143 | 144 | return df 145 | -------------------------------------------------------------------------------- /data/routeviews/fetch_updates.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | #-*- coding: utf-8 -*- 3 | 4 | from pathlib import Path 5 | from io import StringIO 6 | from urllib.parse import urljoin 7 | from datetime import datetime 8 | from dateutil.relativedelta import relativedelta 9 | from concurrent.futures import ThreadPoolExecutor 10 | import pandas as pd 11 | import numpy as np 12 | import subprocess 13 | import re 14 | import json 15 | import click 16 | 17 | SCRIPT_DIR = Path(__file__).resolve().parent 18 | CACHE_DIR = SCRIPT_DIR/"cache" 19 | CACHE_DIR.mkdir(parents=True, exist_ok=True) 20 | 21 | 
current_ym = datetime.now().strftime("%Y.%m") 22 | for cache_file in CACHE_DIR.glob(f"*{current_ym}*"): # remove incomplete cache files 23 | cache_file.unlink() 24 | 25 | def get_all_collectors(url_index="http://routeviews.org/"): 26 | cache_path = CACHE_DIR/f"collectors2url.{url_index.replace('/', '+')}" 27 | if cache_path.exists(): 28 | # print(f"load cache: {cache_path}") 29 | try: return json.load(open(cache_path, "r")) 30 | except: pass 31 | 32 | res = subprocess.check_output(["curl", "-s", url_index]).decode() 33 | res = re.sub(r"\s\s+", " ", res.replace("\n", " ")) 34 | collectors2url = {} 35 | for a, b in re.findall(r'\.+?\([\w\s]+, from (.+?)\)', res): 36 | collector_name = b.split(".")[-3] 37 | if collector_name in collectors2url: 38 | idx = 2 39 | while f"{collector_name}{idx}" in collectors2url: 40 | idx += 1 41 | collector_name = f"{collector_name}{idx}" 42 | collectors2url[collector_name] = urljoin(url_index, a) + "/" 43 | 44 | # print(f"save cache: {cache_path}") 45 | json.dump(collectors2url, open(cache_path, "w"), indent=2) 46 | return collectors2url 47 | 48 | def get_archive_list(collector, collectors2url, dtime1, dtime2): 49 | if collector not in collectors2url: return [] 50 | 51 | def pull_list(ym): 52 | target_url = urljoin(collectors2url[collector], f"{ym}/UPDATES") + "/" 53 | cache_path = CACHE_DIR/f"archive_list.{target_url.replace('/', '+')}" 54 | if cache_path.exists(): 55 | # print(f"load cache: {cache_path}") 56 | try: return target_url, json.load(open(cache_path, "r")) 57 | except: pass 58 | res = subprocess.check_output(["curl", "-s", target_url]).decode() 59 | archive_list = re.findall( 60 | r'\', res) 61 | # print(f"save cache: {cache_path}") 62 | json.dump(archive_list, open(cache_path, "w"), indent=2) 63 | return target_url, archive_list 64 | 65 | ym1 = dtime1.strftime("%Y.%m") 66 | ym2 = dtime2.strftime("%Y.%m") 67 | target_url1, archive_list1 = pull_list(ym1) 68 | target_url2, archive_list2 = pull_list(ym2) 69 | 70 | if not archive_list1 or not archive_list2: 71 | print(f"failed to get archive list: {dtime1} {dtime2}") 72 | exit(1) 73 | 74 | time_list1 = ["".join(i[1:]) for i in archive_list1] 75 | time_list2 = ["".join(i[1:]) for i in archive_list2] 76 | t1 = dtime1.strftime("%Y%m%d%H%M") 77 | t2 = dtime2.strftime("%Y%m%d%H%M") 78 | idx1 = np.searchsorted(time_list1, t1, side="left") 79 | idx2 = np.searchsorted(time_list2, t2, side="right") 80 | 81 | if time_list1 == time_list2: 82 | data = [urljoin(target_url1, i[0]) for i in archive_list1[idx1:idx2]] 83 | else: 84 | data = [urljoin(target_url1, i[0]) for i in archive_list1[idx1:]] 85 | 86 | current_month = datetime(dtime1.year, dtime1.month, 1) 87 | current_month += relativedelta(months=1) 88 | upper_bound = datetime(dtime2.year, dtime2.month, 1) 89 | while current_month < upper_bound: 90 | cur_ym = current_month.strftime("%Y.%m") 91 | cur_target_url, cur_archive_list = pull_list(cur_ym) 92 | data += [urljoin(cur_target_url, i[0]) for i in cur_archive_list] 93 | current_month += relativedelta(months=1) 94 | data += [urljoin(target_url2, i[0]) for i in archive_list2[:idx2]] 95 | 96 | return data 97 | 98 | def download_data(url, collector): 99 | fname = url.split("/")[-1].strip() 100 | outpath = SCRIPT_DIR / "updates" / collector / fname 101 | fpath = outpath.with_suffix("") 102 | if fpath.exists(): 103 | # print(f"updates for {collector} {outpath.stem} already existed") 104 | return fpath 105 | outpath.parent.mkdir(exist_ok=True, parents=True) 106 | subprocess.run(["curl", "-s", url, "--output", 
str(outpath)], check=True) 107 | subprocess.run(["bzip2", "-d", str(outpath)], check=True) 108 | print(f"get updates for {collector} {outpath.stem}") 109 | return fpath 110 | 111 | def load_updates_to_df(fpath, bgpd=SCRIPT_DIR/"bgpd"): 112 | res = subprocess.check_output([str(bgpd), "-q", "-m", "-u", str(fpath)]).decode() 113 | fmt = "type|timestamp|A/W|peer-ip|peer-asn|prefix|as-path|origin-protocol|next-hop|local-pref|MED|community|atomic-agg|aggregator|unknown-field-1|unknown-field-2" 114 | cols = fmt.split("|") 115 | df = pd.read_csv(StringIO(res), sep="|", names=cols, usecols=cols[:-2], dtype=str, keep_default_na=False) 116 | return df 117 | 118 | 119 | @click.command() 120 | @click.option("--collector", type=str, required=True, help="the collector name, e.g., route-views4") 121 | @click.option("--dtime1", type=str, required=True, help="the starttime (included), e.g., 201812312330") 122 | @click.option("--dtime2", type=str, required=True, help="the endtime (included), e.g., 201812312330") 123 | @click.option("--download", type=bool, default=False, help="download the archives") 124 | @click.option("--num-workers", type=int, default=1, help="number of workers") 125 | def main(collector, dtime1, dtime2, download, num_workers): 126 | dtime1 = datetime.strptime(dtime1, "%Y%m%d%H%M") 127 | dtime2 = datetime.strptime(dtime2, "%Y%m%d%H%M") 128 | 129 | collectors2url = get_all_collectors() 130 | data = get_archive_list(collector, collectors2url, dtime1, dtime2) 131 | # print(data) 132 | 133 | if download: 134 | job = lambda url: download_data(url, collector) 135 | with ThreadPoolExecutor(max_workers=num_workers) as executor: 136 | executor.map(job, data) 137 | 138 | # for url in data: 139 | # fpath = download_data(url, collector) 140 | # print(fpath) 141 | # df = load_updates_to_df(fpath) 142 | # print(df) 143 | 144 | if __name__ == "__main__": 145 | main() 146 | -------------------------------------------------------------------------------- /data/bgpstream/locate_route_change.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | #-*- coding: utf-8 -*- 3 | 4 | from pathlib import Path 5 | from datetime import datetime, timedelta 6 | import json 7 | import numpy as np 8 | import pandas as pd 9 | import ipaddress 10 | from joblib import Parallel, delayed 11 | 12 | import sys 13 | sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) 14 | 15 | from data.routeviews.fetch_updates import download_data, load_updates_to_df, get_all_collectors, get_archive_list as get_updates_list 16 | from data.routeviews.fetch_rib import load_ribs_to_df, get_most_recent_rib 17 | from routing_monitor.monitor import Monitor 18 | 19 | SCRIPT_DIR = Path(__file__).resolve().parent 20 | CACHE_DIR = SCRIPT_DIR/"cache" 21 | EVENT_DIR = SCRIPT_DIR/"event" 22 | EVENT_DIR.mkdir(parents=True, exist_ok=True) 23 | 24 | collectors2url = get_all_collectors() 25 | 26 | class RibMonitor(Monitor): 27 | def __init__(self, rib_df, checker): 28 | super().__init__() 29 | self.checker = checker 30 | self.consume(rib_df, detect=False) 31 | 32 | def update(self, timestamp, prefix_str, vantage_point, aspath_str, detect): 33 | prefix = ipaddress.ip_network(prefix_str) 34 | 35 | if prefix.version == 6: 36 | prefixlen = prefix.prefixlen 37 | prefix = int(prefix[0]) >> (128-prefixlen) 38 | else: 39 | prefixlen = prefix.prefixlen 40 | prefix = int(prefix[0]) >> (32-prefixlen) 41 | 42 | aspath = aspath_str.split(" ") 43 | forwarder = aspath[0] # forwarder could be vantage 
point or not 44 | 45 | n = self.root 46 | original_route = None 47 | for shift in range(prefixlen-1, -1, -1): # find the original route 48 | left = (prefix >> shift) & 1 49 | 50 | if left: n = n.get_left() 51 | else: n = n.get_right() 52 | 53 | if n.find_route(forwarder) is not None: 54 | original_route = [shift, n.find_route(forwarder)] 55 | 56 | if detect and original_route is not None: 57 | shift, original_path = original_route 58 | vict_prefix = ipaddress.ip_network(prefix_str) \ 59 | .supernet(new_prefix=prefixlen-shift) 60 | if aspath != original_path: 61 | route_change = { 62 | "timestamp" : timestamp, 63 | "vantage_point": vantage_point, 64 | "forwarder" : forwarder, 65 | "prefix1" : str(vict_prefix), 66 | "prefix2" : prefix_str, 67 | "path1" : " ".join(original_path), 68 | "path2" : " ".join(aspath), 69 | } 70 | if self.checker(route_change): 71 | self.route_changes.append(route_change) 72 | 73 | n.routes[forwarder] = aspath 74 | 75 | def get_event_list(): 76 | existing_ids = [int(i.stem) for i in EVENT_DIR.glob("*.csv")] 77 | last_id = max(existing_ids) if existing_ids else -1 78 | 79 | events = [json.loads(l.strip()) for jl in CACHE_DIR.glob("*.jsonl") 80 | for l in open(jl, "r").readlines()] 81 | events = [ev for ev in events if ev["event_type"] != "Outage"] # NOTE: ignore outage for now 82 | events = np.array(events) 83 | ids = np.array([int(i["event_id"]) for i in events]) 84 | sort_idx = np.argsort(ids) 85 | events = events[sort_idx].tolist() 86 | ids = ids[sort_idx].tolist() 87 | 88 | start_idx = np.searchsorted(ids, last_id, "right") 89 | return events[start_idx:] 90 | 91 | def process_event(collector, event): 92 | if event["event_type"] == "Outage" and len(event["asn"]) != 1: 93 | return # ignore the country-wide outage 94 | 95 | output_dir = EVENT_DIR/event["event_id"] 96 | if (output_dir/f"{collector}.csv").exists(): 97 | return 98 | 99 | dtime1 = datetime.strptime(event["starttime"], "%Y-%m-%d %H:%M:%S") 100 | dtime2 = datetime.strptime(event["endtime"], "%Y-%m-%d %H:%M:%S") \ 101 | if event["endtime"] else dtime1+timedelta(hours=1) 102 | 103 | rib, _, stime = get_most_recent_rib(collector, collectors2url, dtime1) 104 | update_list = get_updates_list(collector, collectors2url, stime, dtime2) 105 | 106 | rib_fpath = download_data(rib, collector) 107 | update_fpaths = [download_data(url, collector) for url in update_list] 108 | 109 | def outage_process(event): 110 | target_asn, = event["asn"] # assert single asn here 111 | def checker(route_change): 112 | p1 = route_change["path1"].split(" ") 113 | p2 = route_change["path2"].split(" ") 114 | return ((target_asn in p1) and (target_asn not in p2)) \ 115 | or ((target_asn not in p1) and (target_asn in p2)) 116 | def locator(df): 117 | return df 118 | return checker, locator 119 | 120 | def hijack_process(event): 121 | expected_asn = event["expected_asn"] 122 | detected_asn = event["detected_asn"] 123 | def checker(route_change): 124 | o1 = route_change["path1"].split(" ")[-1] 125 | o2 = route_change["path2"].split(" ")[-1] 126 | return (expected_asn == o1 and detected_asn == o2) \ 127 | or (detected_asn == o1 and expected_asn == o2) 128 | expected_prefix = event["expected_prefix"] 129 | detected_prefix = event["detected_prefix"] 130 | def locator(df): 131 | return df.loc[(df["prefix"] == expected_prefix) 132 | | (df["prefix"] == detected_prefix)] 133 | return checker, locator 134 | 135 | def leak_process(event): 136 | origin = event["origin_asn"] 137 | leaker = event["leaker_asn"] 138 | leakedto = event["leaked_to"] 139 | 
def checker(route_change): 140 | p1 = route_change["path1"].split(" ") 141 | p2 = route_change["path2"].split(" ") 142 | return origin == p1[-1] and origin == p2[-1] \ 143 | and ((leaker in p1 and leakedto in p1) 144 | or (leaker in p2 and leakedto in p2)) 145 | leaked_prefix = event["leaked_prefix"] 146 | def locator(df): 147 | return df.loc[df["prefix"] == leaked_prefix] 148 | return checker, locator 149 | 150 | if event["event_type"] == "Outage": 151 | checker, locator = outage_process(event) 152 | elif event["event_type"] == "Possible Hijack": 153 | checker, locator = hijack_process(event) 154 | elif event["event_type"] == "BGP Leak": 155 | checker, locator = leak_process(event) 156 | 157 | rib_df = locator(load_ribs_to_df(rib_fpath)) 158 | 159 | mon = RibMonitor(rib_df, checker) 160 | for fp in update_fpaths: 161 | df = locator(load_updates_to_df(fp)) 162 | mon.consume(df, detect=True) 163 | 164 | route_change_df = pd.DataFrame.from_records(mon.route_changes) 165 | 166 | output_dir.mkdir(exist_ok=True, parents=True) 167 | route_change_df.to_csv(output_dir/f"{collector}.csv", index=False) 168 | 169 | events = get_event_list() 170 | np.random.shuffle(events) 171 | 172 | def process_event_safe(collector, event): 173 | try: process_event(collector, event) 174 | except Exception as e: 175 | print(f"{e} at {collector} {event}") 176 | 177 | Parallel(n_jobs=12, backend="multiprocessing", verbose=10)( 178 | delayed(process_event_safe)("route-views4", ev) for ev in events) 179 | 180 | # for ev in events: 181 | # process_event("route-views4", ev) 182 | # print("done") 183 | # input() 184 | -------------------------------------------------------------------------------- /anomaly_detector/utils.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import json 4 | from pathlib import Path 5 | from functools import lru_cache 6 | from scipy.special import softmax 7 | from itertools import chain 8 | from ipaddress import IPv4Network 9 | import pickle 10 | 11 | def read_csv_empty(*args, **kwargs): 12 | try: return pd.read_csv(*args, **kwargs) 13 | except pd.errors.EmptyDataError: return pd.DataFrame() 14 | 15 | def approx_knee_point(x): 16 | x, y = np.unique(x, return_counts=True) 17 | _x = (x-x.min())/(x.max()-x.min()) 18 | _y = y.cumsum()/y.sum() 19 | idx = np.argmax(np.abs(_y-_x)) 20 | return x[idx], _y[idx] 21 | 22 | def load_emb_distance(train_dir, return_emb=False): 23 | train_dir = Path(train_dir) 24 | 25 | node_emb_path = train_dir / "node.emb" 26 | link_emb_path = train_dir / "link.emb" 27 | rela_emb_path = train_dir / "rela.emb" 28 | 29 | node_emb = pickle.load(open(node_emb_path, "rb")) 30 | link_emb = pickle.load(open(link_emb_path, "rb")) 31 | rela_emb = pickle.load(open(rela_emb_path, "rb")) 32 | rela = rela_emb["p2c"] 33 | link = link_emb["p2c"] 34 | link = softmax(link) 35 | 36 | @lru_cache(maxsize=100000) 37 | def _emb_distance(a, b): # could be cluster-like, e.g. '{123,456}' 38 | a = a.strip("{}").split(",")[0] 39 | b = b.strip("{}").split(",")[0] 40 | if a == b: return 0. 
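# ASes missing from the trained embeddings are treated as infinitely far apart;
# otherwise the distance combines a link-weighted squared difference of the node
# embeddings with the magnitude of their projection onto the p2c relation vector.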
41 | if a not in node_emb or b not in node_emb: 42 | return np.inf 43 | xi = node_emb[a] 44 | xj = node_emb[b] 45 | return np.sum((xj-xi)**2*link) + np.abs(np.sum((xj-xi)*rela)) 46 | 47 | def emb_distance(a, b): 48 | return _emb_distance(str(a), str(b)) 49 | 50 | @lru_cache(maxsize=100000) 51 | def _dtw_distance(s, t): 52 | s = [v for i,v in enumerate(s) if i == 0 or v != s[i-1]] 53 | t = [v for i,v in enumerate(t) if i == 0 or v != t[i-1]] 54 | ls, lt = len(s), len(t) 55 | DTW = np.full((ls+1, lt+1), np.inf) 56 | DTW[0,0] = 0. 57 | for i in range(ls): 58 | for j in range(lt): 59 | cost = emb_distance(s[i], t[j]) 60 | DTW[i+1, j+1] = cost + min(DTW[i , j+1], 61 | DTW[i+1, j ], 62 | DTW[i , j ]) 63 | return DTW[ls, lt] 64 | 65 | def dtw_distance(s, t): 66 | return _dtw_distance(tuple(s), tuple(t)) 67 | 68 | @lru_cache(maxsize=100000) 69 | def _path_emb_length(s): 70 | d = np.array([emb_distance(a,b) for a,b in zip(s[:-1], s[1:])]) 71 | d = d[(d > 0) & (d < np.inf)] 72 | return np.nan if d.size == 0 else d.sum() 73 | 74 | def path_emb_length(s): 75 | return _path_emb_length(tuple(s)) 76 | 77 | if return_emb: 78 | return emb_distance, dtw_distance, path_emb_length, node_emb, link, rela 79 | 80 | return emb_distance, dtw_distance, path_emb_length 81 | 82 | def root_cause_localize_2set(df, th=0.95): 83 | set1_asn_cnt, set2_asn_cnt = {}, {} 84 | for i,j in df[["path1", "path2"]].values: 85 | set_i = set(i.split(" ")) 86 | set_j = set(j.split(" ")) 87 | set_ij = set_i - set_j 88 | set_ji = set_j - set_i 89 | for asn in set_ij: 90 | if asn not in set1_asn_cnt: set1_asn_cnt[asn] = 1 91 | else: set1_asn_cnt[asn] += 1 92 | for asn in set_ji: 93 | if asn not in set2_asn_cnt: set2_asn_cnt[asn] = 1 94 | else: set2_asn_cnt[asn] += 1 95 | 96 | set1, cnt1 = list(set1_asn_cnt.keys()), list(set1_asn_cnt.values()) 97 | idx1 = np.argsort(cnt1)[::-1] 98 | set1 = np.array(set1)[idx1] 99 | cnt1 = np.array(cnt1)[idx1] 100 | 101 | set2, cnt2 = list(set2_asn_cnt.keys()), list(set2_asn_cnt.values()) 102 | idx2 = np.argsort(cnt2)[::-1] 103 | set2 = np.array(set2)[idx2] 104 | cnt2 = np.array(cnt2)[idx2] 105 | 106 | rc_1, rc_2 = [], [] 107 | for a,b in zip(set1, cnt1): 108 | if b/df.shape[0] > th: rc_1.append(a) 109 | for a,b in zip(set2, cnt2): 110 | if b/df.shape[0] > th: rc_2.append(a) 111 | 112 | return sorted(rc_1), sorted(rc_2) 113 | 114 | def root_cause_localize_1set(df, th=0.95): 115 | set_asn_cnt = {} 116 | for i,j in df[["path1", "path2"]].values: 117 | set_i = set(i.split(" ")) 118 | set_j = set(j.split(" ")) 119 | set_xor = set_i^set_j 120 | for asn in set_xor: 121 | if asn not in set_asn_cnt: set_asn_cnt[asn] = 1 122 | else: set_asn_cnt[asn] += 1 123 | 124 | set_asn, cnt = list(set_asn_cnt.keys()), list(set_asn_cnt.values()) 125 | idx = np.argsort(cnt)[::-1] 126 | set_asn = np.array(set_asn)[idx] 127 | cnt = np.array(cnt)[idx] 128 | 129 | rc = [] 130 | for a,b in zip(set_asn, cnt): 131 | if b/df.shape[0] > th: rc.append(a) 132 | 133 | return sorted(rc) 134 | 135 | 136 | def link_root_cause(culprit_to_df): 137 | rcs = list(culprit_to_df.keys()) 138 | dfs = list(culprit_to_df.values()) 139 | 140 | def rc_to_set(rc): 141 | culprit_type, culprit_tuple = rc 142 | assert culprit_type in ["Prefix", "AS"] 143 | if culprit_type == "AS": 144 | culprit_set = set(chain(*culprit_tuple)) 145 | else: # must be "Prefix" 146 | culprit_set = {IPv4Network(p) for p in culprit_tuple} 147 | return culprit_type, culprit_set 148 | 149 | def rc_set_related(rc1, rc2): 150 | t1, set1 = rc1 151 | t2, set2 = rc2 152 | if t1 != t2: 153 
| return False 154 | if t1 == "AS": 155 | return set1&set2 156 | else: # t1 and t2 must be "Prefix" 157 | for i in set1: 158 | for j in set2: 159 | if i.overlaps(j): # check if they overlap 160 | return True 161 | if i.prefixlen == j.prefixlen: # check if they're two consecutive prefixes 162 | return abs((int(i[0])>>(32-i.prefixlen)) 163 | -(int(j[0])>>(32-j.prefixlen))) <= 1 164 | return False 165 | 166 | pool = list(map(rc_to_set, rcs)) 167 | group_id = [-1]*len(culprit_to_df) 168 | id_group = dict() 169 | next_id = 0 170 | for i in range(len(culprit_to_df)): 171 | if group_id[i] == -1: 172 | group_id[i] = next_id 173 | next_id += 1 174 | id_group[group_id[i]] = [i] 175 | for j in range(i+1, len(culprit_to_df)): 176 | if group_id[j] == group_id[i]: continue 177 | if rc_set_related(pool[i], pool[j]): 178 | if group_id[j] == -1: 179 | group_id[j] = group_id[i] 180 | id_group[group_id[i]].append(j) 181 | else: 182 | to_be_merged = id_group.pop(group_id[j]) 183 | id_group[group_id[i]] += to_be_merged 184 | for k in to_be_merged: group_id[k] = group_id[i] 185 | group_id_set = set(group_id) 186 | group_id_remapping = dict(zip(group_id_set, range(len(group_id_set)))) 187 | for idx, df in enumerate(dfs): 188 | df["group_id"] = group_id_remapping[group_id[idx]] 189 | return id_group, pd.concat(dfs, ignore_index=True) 190 | 191 | def event_aggregate(events): 192 | culprit2eventkey = {} 193 | eventkey2culprit = {} 194 | 195 | for k,v in events.items(): 196 | rc_1, rc_2 = root_cause_localize_2set(v) 197 | rc_3 = root_cause_localize_1set(v) 198 | if rc_1 or rc_2: 199 | culprit = "AS", (tuple(rc_1), tuple(rc_2)) 200 | elif rc_3: 201 | culprit = "AS", (tuple(rc_3),) 202 | else: 203 | culprit = "Prefix", k 204 | culprit2eventkey.setdefault(culprit, set()).add(k) 205 | eventkey2culprit[k] = culprit 206 | 207 | culprit_to_df = {k: pd.concat([events[i] for i in v]) 208 | for k, v in culprit2eventkey.items()} 209 | for k, v in culprit_to_df.items(): 210 | _, culprit_tuple = k 211 | v["culprit"] = json.dumps(culprit_tuple) 212 | rc_groups, df = link_root_cause(culprit_to_df) 213 | 214 | return rc_groups, df 215 | -------------------------------------------------------------------------------- /BEAM_engine/BEAM_model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn.functional 4 | import torch.utils.data 5 | import pickle 6 | import time 7 | from itertools import cycle 8 | from pathlib import Path 9 | 10 | class Analyzer(torch.utils.data.Dataset): 11 | 12 | def __init__(self, Q): 13 | super(Analyzer, self).__init__() 14 | self.Q = Q 15 | self.edge_list = [] 16 | 17 | self.asn_list = [] 18 | self.asn2idx = {} 19 | 20 | self.downstreams = [] 21 | self.upstreams = [] 22 | 23 | def read_edge_file(self, edge_file): 24 | def get_index(asn): 25 | if asn not in self.asn2idx: 26 | self.asn2idx[asn] = len(self.asn_list) 27 | self.asn_list.append(asn) 28 | self.downstreams.append(set()) 29 | self.upstreams.append(set()) 30 | return self.asn2idx[asn] 31 | 32 | for line in open(edge_file, "r"): 33 | if line[0] == "#": continue 34 | i, j, k = line.strip().split("|")[:3] 35 | 36 | index_i = get_index(i) 37 | index_j = get_index(j) 38 | 39 | assert index_i != index_j 40 | 41 | if k == "0": 42 | self.edge_list.append((index_i, index_j)) 43 | self.downstreams[index_i].add(index_j) 44 | self.upstreams[index_j].add(index_i) 45 | 46 | self.edge_list.append((index_j, index_i)) 47 | self.downstreams[index_j].add(index_i) 48 | 
self.upstreams[index_i].add(index_j) 49 | elif k == "-1": 50 | self.edge_list.append((index_i, index_j)) 51 | self.downstreams[index_i].add(index_j) 52 | self.upstreams[index_j].add(index_i) 53 | else: 54 | raise RuntimeError(f"unexpected rel {rel}") 55 | 56 | print(f"nodes: {len(self.asn_list)}") 57 | print(f"edges: {len(self.edge_list)}") 58 | 59 | self.init_sample_method() 60 | 61 | return self 62 | 63 | def init_sample_method(self, eps=0.01): 64 | upstreams = self.upstreams 65 | downstreams = self.downstreams 66 | 67 | global_cycler = cycle(range(len(self.asn_list))) 68 | none_cycler = cycle([None]) 69 | 70 | # providers as tails, thus negative samples 71 | negative_tails = [cycle(u-d) if u-d else none_cycler 72 | for u,d in zip(upstreams, downstreams)] 73 | 74 | # customers as heads, thus negative samples 75 | negative_heads = [cycle(d-u) if d-u else none_cycler 76 | for u,d in zip(upstreams, downstreams)] 77 | 78 | def get_local_tail_negative(head): 79 | return next(negative_tails[head]) 80 | 81 | def get_local_head_negative(tail): 82 | return next(negative_heads[tail]) 83 | 84 | def get_global_tail_negative(head): 85 | for tail_negative in global_cycler: 86 | if tail_negative != head and tail_negative not in downstreams[head]: 87 | return head, tail_negative 88 | 89 | def get_global_head_negative(tail): 90 | for head_negative in global_cycler: 91 | if head_negative != tail and head_negative not in upstreams[tail]: 92 | return head_negative, tail 93 | 94 | bound1 = 0.5-eps 95 | bound2 = 0.5+eps 96 | 97 | def draw_negative_sample(head, tail): 98 | r = np.random.random() 99 | if r < bound1: # try corrupt tail 100 | tail_negative = get_local_tail_negative(head) 101 | if tail_negative: 102 | sample = (head, tail_negative) 103 | else: # try corrupt head 104 | head_negative = get_local_head_negative(tail) 105 | if head_negative: 106 | sample = (head_negative, tail) 107 | else: # global negative sample 108 | sample = get_global_tail_negative(head) 109 | 110 | elif r > bound2: # try corrupt head 111 | head_negative = get_local_head_negative(tail) 112 | if head_negative: 113 | sample = (head_negative, tail) 114 | else: # try corrupt tail 115 | tail_negative = get_local_tail_negative(head) 116 | if tail_negative: 117 | sample = (head, tail_negative) 118 | else: # global negative sample 119 | sample = get_global_head_negative(tail) 120 | 121 | else: # global negative sample 122 | if r < 0.5: 123 | sample = get_global_head_negative(tail) 124 | else: 125 | sample = get_global_tail_negative(head) 126 | 127 | return sample 128 | 129 | self.draw_negative_sample = draw_negative_sample 130 | 131 | def __len__(self): 132 | return len(self.edge_list) * self.Q 133 | 134 | def __getitem__(self, index): 135 | positive_sample = self.edge_list[index // self.Q] 136 | negative_sample = self.draw_negative_sample(*positive_sample) 137 | input_vector = [0, *positive_sample, *negative_sample] 138 | return torch.tensor(input_vector, dtype=torch.int64, requires_grad=False) 139 | 140 | 141 | class BEAM(torch.nn.Module): 142 | 143 | def __init__(self, edge_file, Q=5, dimension=128, train_dir=Path("./"), cuda_device='cuda', num_workers=20): 144 | super(BEAM, self).__init__() 145 | 146 | self.use_cuda = torch.cuda.is_available() 147 | self.device = torch.device(cuda_device if self.use_cuda else 'cpu') 148 | print("device: {}".format(self.device)) 149 | 150 | self.train_dir = train_dir 151 | 152 | self.analyzer = Analyzer(Q).read_edge_file(edge_file) 153 | self.node_embedding = torch.nn.Embedding( 154 | 
len(self.analyzer.asn_list), dimension) 155 | self.rela_embedding = torch.nn.Embedding(1, dimension) 156 | self.link_embedding = torch.nn.Embedding(1, dimension) 157 | 158 | self.num_workers = num_workers 159 | 160 | def forward(self, batchVector): 161 | idx_k = batchVector[:,0] 162 | link = torch.nn.functional.softmax(self.link_embedding(idx_k), dim=1) 163 | rela = self.rela_embedding(idx_k) 164 | pi = self.node_embedding(batchVector[:,1]) 165 | pj = self.node_embedding(batchVector[:,2]) 166 | ni = self.node_embedding(batchVector[:,3]) 167 | nj = self.node_embedding(batchVector[:,4]) 168 | 169 | # softplus(corrupt - correct) 170 | relaError = torch.sum((nj-ni-pj+pi)*rela, dim=1) # criteria 2 171 | linkError = torch.sum((pj-pi-nj+ni)*(pj-pi+nj-ni)*link, dim=1) 172 | loss = torch.nn.functional.softplus(relaError + linkError) 173 | 174 | return loss 175 | 176 | def train(self, epoches=500): 177 | self.to(device=self.device) 178 | optimizer = torch.optim.Adam(self.parameters(), lr=1e-3, weight_decay=0) 179 | generator = torch.utils.data.DataLoader( 180 | self.analyzer, batch_size=1024, shuffle=True, 181 | num_workers=self.num_workers) 182 | 183 | for epoch in range(1, epoches + 1): 184 | if epoch%100 == 0: 185 | torch.save({ 186 | "epoch": epoch, 187 | "model_state_dict": self.state_dict(), 188 | "optimizer_state_dict": optimizer.state_dict(), 189 | }, self.train_dir / "checkpoint") 190 | loss = 0.0 191 | tik = time.time() 192 | for batchData in generator: 193 | optimizer.zero_grad() 194 | batchData = batchData.to(device=self.device) 195 | batchLoss = self(batchData).sum() 196 | loss += float(batchLoss) 197 | batchLoss.backward() 198 | optimizer.step() 199 | tok = time.time() 200 | print(f"Epoch{epoch:4d}/{epoches} Loss: {loss:e} Time: {tok-tik:.1f}s") 201 | 202 | def save_embeddings(self, path='.'): 203 | print("save embeddings...") 204 | path = Path(path) 205 | node_keys = self.analyzer.asn_list 206 | rela_keys = ["p2c"] 207 | link_keys = ["p2c"] 208 | 209 | def dump_embedding(keys, tensor, filePath): 210 | if self.use_cuda: 211 | emb = tensor.weight.cpu().data.numpy() 212 | else: 213 | emb = tensor.weight.data.numpy() 214 | emb = dict(zip(keys, emb)) 215 | pickle.dump(emb, open(filePath, 'wb')) 216 | 217 | dump_embedding(node_keys, self.node_embedding, path/'node.emb') 218 | dump_embedding(rela_keys, self.rela_embedding, path/'rela.emb') 219 | dump_embedding(link_keys, self.link_embedding, path/'link.emb') 220 | -------------------------------------------------------------------------------- /post_processor/html/template_routeviews.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | REPLACE_WITH_TITLE 5 | 6 | 7 | 228 | 229 | 304 | 305 | 356 | 357 | 394 | 395 | 396 | 397 | 398 | 399 |

[template_routeviews.html body: section headings "Report", "Runtime stats", and "All alarms", plus the REPLACE_WITH_EXPLANATION placeholder; the surrounding HTML markup (template lines 400-458) is not preserved in this dump]
459 | 460 | REPLACE_WITH_SECTIONS 461 | 462 | 478 | 479 | 480 | 481 | -------------------------------------------------------------------------------- /post_processor/alarm_postprocess_routeviews.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | #-*- coding: utf-8 -*- 3 | 4 | from pathlib import Path 5 | import pandas as pd 6 | import numpy as np 7 | import json 8 | import click 9 | from functools import lru_cache 10 | from datetime import datetime, timedelta 11 | 12 | from rpki_validator import RPKI 13 | from irr_validator import RADB 14 | from whois_lookup import whois_match 15 | 16 | import sys 17 | sys.path.append(str(Path(__file__).resolve().parent.parent)) 18 | from data.caida_as_org.fetch_data import get_most_recent as as_org_file 19 | from data.caida_as_org.query import load as parse_as_org 20 | from data.caida_as_rel.fetch_data import get as as_rel_file 21 | 22 | def get_one_asn(asn): 23 | return asn.strip("{}").split(",")[0] 24 | 25 | def get_recent_monday(date_str): 26 | date = datetime.strptime(date_str, "%Y%m%d").date() 27 | monday = date - timedelta(days=date.weekday()) 28 | return monday 29 | 30 | def load_as_org(time): 31 | time, fpath = as_org_file(time) 32 | as_info, org_info = parse_as_org(time) 33 | 34 | def get_org_id(asn): 35 | if asn not in as_info: 36 | return asn 37 | info = as_info[asn] 38 | return info["opaque_id"] if info["opaque_id"] != "" else info["org_id"] 39 | 40 | def from_same_org(asn1, asn2): 41 | if get_org_id(asn1) == get_org_id(asn2): 42 | return get_org_id(asn1) 43 | else: 44 | return "-" 45 | 46 | def get_asn_country(asn): 47 | org_id = get_org_id(asn) 48 | if org_id in org_info: 49 | return org_info[org_id]["country"] 50 | 51 | return as_info, org_info, from_same_org, get_asn_country 52 | 53 | def load_as_rel(serial, time): 54 | target = as_rel_file(serial, time) 55 | as_rel_map = {} 56 | lines = open(target, "r").readlines() 57 | for l in lines: 58 | if l[0] == "#": continue 59 | as1, as2, rel = l.split("|")[:3] 60 | rel = int(rel) 61 | as_rel_map.setdefault(as1, {-1:set(), 0:set(), 1:set()})[+rel].add(as2) 62 | as_rel_map.setdefault(as2, {-1:set(), 0:set(), 1:set()})[-rel].add(as1) 63 | 64 | def get_as_rel(as1, as2): 65 | if as1 in as_rel_map: 66 | for rel, as_set in as_rel_map[as1].items(): 67 | if as2 in as_set: return rel 68 | return None 69 | 70 | def get_all_ngbrs(asn): 71 | if asn not in as_rel_map: return None 72 | ret = set() 73 | for v in as_rel_map[asn].values(): 74 | ret |= v 75 | return ret 76 | 77 | def have_connection(as1, as2): 78 | if as1 in as_rel_map and as2 in as_rel_map: 79 | for rel,ngbrs in as_rel_map[as1].items(): 80 | if as2 in ngbrs: 81 | return f"rel({rel})" 82 | ret = [] 83 | for rel in [-1, 0, 1]: 84 | if as_rel_map[as1][rel]&as_rel_map[as2][rel]: 85 | ret.append(f"{rel}") 86 | if ret: 87 | return f"ngbr({';'.join(ret)})" 88 | return "-" 89 | 90 | return as_rel_map, get_as_rel, have_connection 91 | 92 | def different_origin_country(path1, path2, get_asn_country): 93 | cty1 = get_asn_country(path1[-1]) 94 | cty2 = get_asn_country(path2[-1]) 95 | if cty1 != cty2: 96 | return f"{cty1};{cty2}" 97 | else: 98 | return "-" 99 | 100 | def have_origin_connection(path1, path2, have_connection): 101 | return have_connection(path1[-1], path2[-1]) 102 | 103 | def have_unknown_asn(path, as_rel_map): 104 | ret = [] 105 | for i in path: 106 | if i not in as_rel_map: 107 | ret.append(str(i)) 108 | if ret: 109 | return ";".join(set(ret)) 110 | return "-" 111 | 112 | def 
have_reserved_asn(path): 113 | # reserved ASN (last updated: 2024-04-10) 114 | # https://www.iana.org/assignments/as-numbers/as-numbers.xhtml 115 | ret = [] 116 | for i in path: 117 | i = int(i) 118 | if (i == 0 or i == 112 or i == 23456 or 119 | (i >= 64496 and i <= 131071) or 120 | (i >= 153914 and i <= 196607) or 121 | (i >= 216476 and i <= 262143) or 122 | (i >= 274845 and i <= 327679) or 123 | (i >= 329728 and i <= 393215) or 124 | i >= 402333): 125 | ret.append(str(i)) 126 | if ret: 127 | return ";".join(set(ret)) 128 | return "-" 129 | 130 | def non_valley_free_or_none_rel(path, get_as_rel): 131 | none_rel = [] 132 | non_valley_free = False 133 | rel_seq = [] 134 | state = 1 135 | for a,b in zip(path[:-1], path[1:]): 136 | if a == b: 137 | rel_seq.append("x") 138 | continue 139 | r = get_as_rel(a, b) 140 | if r is None: 141 | rel_seq.append("x") 142 | none_rel.append(f"({a} {b})") 143 | continue 144 | rel_seq.append(str(r)) 145 | if state == 1: 146 | state = r 147 | continue 148 | if r != -1: 149 | non_valley_free = True 150 | 151 | if non_valley_free: 152 | non_valley_free = " ".join(rel_seq) 153 | else: 154 | non_valley_free = "-" 155 | 156 | if none_rel: 157 | none_rel = ";".join(set(none_rel)) 158 | else: 159 | none_rel = "-" 160 | 161 | return non_valley_free, none_rel 162 | 163 | def detour_country(path1, path2, get_asn_country): 164 | countries0 = set(filter(lambda x: x is not None, 165 | map(get_asn_country, path1[:-1]))) 166 | countries1 = set(filter(lambda x: x is not None, 167 | map(get_asn_country, path2[:-1]))) 168 | detour_new_country = len(countries1-countries0) > 0 169 | return detour_new_country 170 | 171 | def as_prepend(path): 172 | ret = [] 173 | for asn, cnt in zip(*np.unique(path, return_counts=True)): 174 | if cnt > 1: ret.append(asn) 175 | if ret: 176 | return ";".join(ret) 177 | return "-" 178 | 179 | def origin_different_upstream(path1, path2, get_as_rel): 180 | path1 = [v for i,v in enumerate(path1) if i == 0 or v != path1[i-1]] 181 | path2 = [v for i,v in enumerate(path2) if i == 0 or v != path2[i-1]] 182 | if len(path1) >= 2 and len(path2) >= 2 \ 183 | and path1[-1] == path2[-1] \ 184 | and path1[-2] != path2[-2] \ 185 | and get_as_rel(path1[-2], path1[-1]) is not None \ 186 | and get_as_rel(path2[-2], path2[-1]) is not None: 187 | return f"{path1[-2]};{path2[-2]}" 188 | return "-" 189 | 190 | def origin_rpki_valid(rpki, prefix, path): 191 | return rpki.validate(prefix, path[-1]) 192 | 193 | def origin_irr_valid(radb, prefix, path): 194 | return radb.validate(prefix, path[-1]) 195 | 196 | def origin_whois_match(prefix, path): 197 | return whois_match(prefix, path[-1]) 198 | 199 | def path_superset(path1, path2): 200 | return ",".join(path1) in ",".join(path2) 201 | 202 | @lru_cache(maxsize=10) 203 | def _get_rpki(date): 204 | return RPKI().load_data(date.year, date.month, date.day) 205 | 206 | def get_rpki(date): 207 | return _get_rpki(get_recent_monday(date)) 208 | 209 | @lru_cache(maxsize=10) 210 | def _get_radb(date): 211 | return RADB().load_data(date.year, date.month, date.day) 212 | 213 | def get_radb(date): 214 | return _get_radb(get_recent_monday(date)) 215 | 216 | @click.command() 217 | @click.option("--collector", "-c", type=str, default="wide", help="the name of RouteView collector to postprocess the detection results") 218 | @click.option("--year", "-y", type=int, required=True, help="the year of the detection results, e.g., 2024") 219 | @click.option("--month", "-m", type=int, required=True, help="the month of the detection results, e.g., 
8") 220 | def postprocess(collector, year, month): 221 | as_info, org_info, from_same_org, get_asn_country = load_as_org(f"{year}{month:02d}01") 222 | as_rel_map, get_as_rel, have_connection = load_as_rel("1", f"{year}{month:02d}01") 223 | 224 | repo_dir = Path(__file__).resolve().parent.parent 225 | collector_result_dir = repo_dir/"routing_monitor"/"detection_result"/collector 226 | reported_alarm_dir = collector_result_dir/"reported_alarms"/f"{year}{month:02d}" 227 | info = json.load(open(reported_alarm_dir/f"info_{year}{month:02d}.json", "r")) 228 | flags_dir = reported_alarm_dir.parent/f"{year}{month:02d}.flags" 229 | flags_dir.mkdir(parents=True, exist_ok=True) 230 | 231 | for i in info: 232 | if i["save_path"] is None: continue 233 | df = pd.read_csv(i["save_path"]) 234 | prefix1 = df["prefix1"].values 235 | prefix2 = df["prefix2"].values 236 | path1 = [list(map(get_one_asn, i.split(" "))) for i in df["path1"].values] 237 | path2 = [list(map(get_one_asn, i.split(" "))) for i in df["path2"].values] 238 | 239 | non_valley_free_1, none_rel_1 = np.array(list(map(lambda x: non_valley_free_or_none_rel(x, get_as_rel), path1))).T 240 | non_valley_free_2, none_rel_2 = np.array(list(map(lambda x: non_valley_free_or_none_rel(x, get_as_rel), path2))).T 241 | 242 | rpki = get_rpki(i["d0"][:8]) 243 | radb = get_radb(i["d0"][:8]) 244 | 245 | flags = pd.DataFrame.from_dict({ 246 | "subprefix_change": [p1 != p2 for p1, p2 in zip(prefix1, prefix2)], 247 | "origin_change": [l1[-1] != l2[-1] for l1, l2 in zip(path1, path2)], 248 | "origin_same_org": [from_same_org(l1[-1], l2[-1]) for l1, l2 in zip(path1, path2)], 249 | "origin_country_change": [different_origin_country(l1, l2, get_asn_country) for l1, l2 in zip(path1, path2)], 250 | "origin_connection": [have_origin_connection(l1, l2, have_connection) for l1, l2 in zip(path1, path2)], 251 | "origin_different_upstream": [origin_different_upstream(l1, l2, get_as_rel) for l1, l2 in zip(path1, path2)], 252 | "origin_rpki_1": [origin_rpki_valid(rpki, p, l) for p, l in zip(prefix1, path1)], 253 | "origin_rpki_2": [origin_rpki_valid(rpki, p, l) for p, l in zip(prefix2, path2)], 254 | "origin_irr_1": [origin_irr_valid(radb, p, l) for p, l in zip(prefix1, path1)], 255 | "origin_irr_2": [origin_irr_valid(radb, p, l) for p, l in zip(prefix2, path2)], 256 | "origin_whois_1": [origin_whois_match(p, l) for p, l in zip(prefix1, path1)], 257 | "origin_whois_2": [origin_whois_match(p, l) for p, l in zip(prefix2, path2)], 258 | "unknown_asn_1": [have_unknown_asn(l, as_rel_map) for l in path1], 259 | "unknown_asn_2": [have_unknown_asn(l, as_rel_map) for l in path2], 260 | "reserved_path_1": [have_reserved_asn(l) for l in path1], 261 | "reserved_path_2": [have_reserved_asn(l) for l in path2], 262 | "non_valley_free_1": non_valley_free_1, 263 | "non_valley_free_2": non_valley_free_2, 264 | "none_rel_1": none_rel_1, 265 | "none_rel_2": none_rel_2, 266 | "as_prepend_1": [as_prepend(l) for l in path1], 267 | "as_prepend_2": [as_prepend(l) for l in path2], 268 | "detour_country": [detour_country(l1, l2, get_asn_country) for l1, l2 in zip(path1, path2)], 269 | "path1_in_path2": [path_superset(l1, l2) for l1, l2 in zip(path1, path2)], 270 | "path2_in_path1": [path_superset(l2, l1) for l1, l2 in zip(path1, path2)], 271 | }) 272 | flags.to_csv(flags_dir/f"{Path(i['save_path']).stem}.flags.csv", index=False) 273 | 274 | if __name__ == "__main__": 275 | postprocess() 276 | -------------------------------------------------------------------------------- /readme.md: 
-------------------------------------------------------------------------------- 1 | # Semantics-Aware Routing Anomaly Detection System 2 | 3 | This is a demonstration codebase for the routing anomaly detection system presented in the USENIX Security 2024 [paper](https://www.usenix.org/conference/usenixsecurity24/presentation/chen-yihao), _Learning with Semantics: Towards a Semantics-Aware Routing Anomaly Detection System_. The repo is intended for research and demonstration purposes and may not be ready for production use. A full-featured, open-source version of the system, potentially including a public service, is currently under development; please see the [Future Work](#future-work) section for more information. 4 | 5 | Contact: yh-chen21@mails.tsinghua.edu.cn 6 | 7 | ## System Overview 8 | 9 | 10 | 11 | The system consists of three main modules: 12 | 13 | - **BEAM Engine** (`BEAM_engine/`): Uses AS business relationship data as input to train the BEAM model, which is used to quantify the path difference (abnormality) of route changes. 14 | 15 | - **Routing Monitor** (`routing_monitor/`): Takes BGP update announcements as input and outputs detected route changes. 16 | 17 | - **Anomaly Detector** (`anomaly_detector/`): Performs anomaly detection on the route changes and conducts correlation analysis on detected anomalous route changes, outputting anomaly alarms. 18 | 19 | A post-processing module (`post_processor/`) is additionally introduced for anomaly inspection and well-formatted HTML reports. 20 | 21 | ## Workflow 22 | 23 | A typical workflow with this codebase is as follows: 24 | 25 | 1. Train the BEAM model. 26 | 2. Detect route changes from a window of routing data. 27 | 3. Use the BEAM model to quantify the path difference of the route changes. 28 | 4. Identify those with abnormally high path difference, aggregate them, and raise alarms. 29 | 5. Generate a formatted anomaly report. 30 | 31 | ## Get Started 32 | 33 | ### 0. Prepare the environment 34 | 35 | - Python (>=3.8) is required, along with the necessary packages. A GPU with CUDA is recommended for model training. 36 | 37 | Set it up using Anaconda or Miniconda as follows: 38 | 39 | ```bash 40 | conda create -n beam python=3.8 numpy pandas scipy tqdm joblib click pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia -y 41 | conda activate beam 42 | ``` 43 | 44 | - The [BGPdump tool](https://github.com/RIPE-NCC/bgpdump) is required for parsing MRT-format BGP routing data. 45 | 46 | Build it from source and link the binary to `$YOUR_REPO_PATH/data/routeviews/bgpd`, as follows: 47 | 48 | ```bash 49 | git clone https://github.com/RIPE-NCC/bgpdump.git 50 | cd bgpdump 51 | sh ./bootstrap.sh 52 | make 53 | ln -s $(pwd)/bgpdump $YOUR_REPO_PATH/data/routeviews/bgpd 54 | $YOUR_REPO_PATH/data/routeviews/bgpd -T # should print the test output 55 | ``` 56 | 57 | ### 1. Train the BEAM model 58 | 59 | Run `BEAM_engine/train.py` for model training. See all available parameters with `--help`. 60 | 61 | An example run is as follows: 62 | 63 | ```bash 64 | python train.py --serial 2 \ 65 | --time 20240801 \ 66 | --Q 10 \ 67 | --dimension 128 \ 68 | --epoches 1000 \ 69 | --device 0 \ 70 | --num-workers 10 71 | ``` 72 | 73 | This example downloads the [CAIDA AS relationship data](https://publicdata.caida.org/datasets/as-relationships/) of serial-2, Aug. 1, 2024, to train a BEAM model for 1000 epochs, with the number of negative samples (`Q`) set to 10 and the embedding vector dimension (`d`) set to 128.
Training is executed on device 0 (either CPU or GPU, depending on the machine), and up to 10 parallel workers are used for data processing. 74 | 75 | Notes: 76 | 77 | - The CAIDA AS relationship data is updated monthly. A typical archive today contains approximately 500,000 AS relationship records and is around 1.5MB in size. 78 | 79 | - The required CAIDA data is downloaded upon first use and stored in either `data/caida_as_rel/serial-1/` or `data/caida_as_rel/serial-2/`. Alternatively, other sources can be used if they follow the same format. 80 | 81 | - The trained model is saved in a directory under `BEAM_engine/models/`, named according to the training parameters. This includes the trained embedding vectors (`link.emb`, `node.emb`, `rela.emb`). 82 | 83 | - For reference, on a dual-core Xeon E5-2650v4 with a GeForce RTX 2080 Ti, training for 1000 epoches takes about 10 hours, with peak memory usage within 10GB. 84 | 85 | ### 2. Detect route changes 86 | 87 | Run `routing_monitor/detect_route_change_routeviews.py` for monthly route change detection. See all available parameters with `--help`. 88 | 89 | An example run is as follows: 90 | 91 | ```bash 92 | python detect_route_change_routeviews.py \ 93 | --collector wide \ 94 | --year 2024 \ 95 | --month 8 96 | ``` 97 | 98 | This example downloads and identifies route changes with the BGP update announcements from the `wide` collector of [RouteViews](http://routeviews.org/), for the entire month of August, 2024. 99 | 100 | Notes: 101 | 102 | - RouteViews maintains over 30 collectors, each of which archives BGP update announcements in [MRT format](https://www.rfc-editor.org/rfc/rfc6396) at approximately 15-minute intervals. BGPdump is called as a subprocess to load these data. Other sources of data can also be used if they adhere to the MRT format. 103 | 104 | - The required RouteViews data is downloaded upon first use and stored in a directory under `data/routeviews/updates/`, named after the chosen collector. 105 | 106 | - This script processes the routing data of an entire month sequentially, in an offline manner. A global routing table is maintained in a Trie structure to track the route changes. The results are stored in a directory under `routing_monitor/detection_result/`, named after the chosen collector. The results include the identified route changes and daily snapshots of the global routing table. 107 | 108 | ### 3. Quantify path difference 109 | 110 | Run `anomaly_detector/BEAM_diff_evaluator_routeviews.py` for path difference evaluation on the monthly route changes. See all available parameters with `--help`. 111 | 112 | An example run is as follows: 113 | 114 | ```bash 115 | python BEAM_diff_evaluator_routeviews.py \ 116 | --collector wide \ 117 | --year 2024 \ 118 | --month 8 \ 119 | --beam-model 20240801.as-rel2.1000.10.128 120 | ``` 121 | 122 | This example uses the BEAM model trained in [Step 1](#1-train-the-beam-model) to evaluate the path difference of the route changes detected in [Step 2](#2-detect-route-changes). 123 | 124 | Notes: 125 | 126 | - This script evaluates the path difference for route changes of an entire month sequentially, in an offline manner. The results are stored in `BEAM_metric/`, under the same parent directory as the route change directory of the chosen collector. 127 | 128 | ### 4. Detect anomalies 129 | 130 | Run `anomaly_detector/report_anomaly_routeviews.py` to detect anomalies based on the path difference of route changes. See all available parameters with `--help`. 
131 | 132 | An example run is as follows: 133 | 134 | ```bash 135 | python report_anomaly_routeviews.py \ 136 | --collector wide \ 137 | --year 2024 \ 138 | --month 8 139 | ``` 140 | 141 | This example detects anomalies based on the route changes detected in [Step 2](#2-detect-route-changes) and their path difference evaluated in [Step 3](#3-quantify-path-difference). 142 | 143 | Notes: 144 | 145 | - This script detects anomalies for route changes of an entire month sequentially, in an offline manner. The results are stored in `reported_alarms/`, under the same parent directory as the route change directory of the chosen collector. 146 | 147 | - The results include the anomaly alarms raised for each time window, in separate CSV files, as well as a JSON file describing the overall information of the month's detection. Each alarm contains the time window, prefixes, associated ASes, and corresponding anomalous route changes, all associated with a single anomaly. 148 | 149 | ### 5. Generate the report 150 | 151 | Run `post_processor/alarm_postprocess_routeviews.py` to incorporate additional knowledge, e.g., RPKI states, for identifying properties associated with the generated alarms. See all available parameters with `--help`. 152 | 153 | An example run is as follows: 154 | 155 | ```bash 156 | python alarm_postprocess_routeviews.py \ 157 | --collector wide \ 158 | --year 2024 \ 159 | --month 8 160 | ``` 161 | 162 | This example identifies properties associated with the alarms generated in [Step 4](#4-detect-anomalies). 163 | 164 | Notes: 165 | 166 | - This script utilizes additional knowledge to identify several properties associated with the alarms, for a better understanding of the anomalies. The results are stored in `reported_alarms.flags/`, under the same parent directory as the route change directory of the chosen collector. 167 | 168 | - Each alarm is associated with the following properties: 169 | - `subprefix_change`: the alarm includes route changes involving sub-prefixes. 170 | - `origin_change`: the alarm includes route changes involving a change of origin ASes. 171 | - `origin_same_org`: the alarm includes origin changes where the different origin ASes are from the same organization. 172 | - `origin_country_change`: the alarm includes origin changes where the different origin ASes are from different countries. 173 | - `origin_connection`: the alarm includes origin changes where the different origin ASes are connected. 174 | - `origin_different_upstream`: the alarm includes route changes where the same origin AS is reached through different upstream providers. 175 | - `origin_rpki_1`: the origin before the change is RPKI-valid. 176 | - `origin_rpki_2`: the origin after the change is RPKI-valid. 177 | - `unknown_asn_1`: the route before the change includes an unknown ASN. 178 | - `unknown_asn_2`: the route after the change includes an unknown ASN. 179 | - `reserved_path_1`: the route before the change includes a reserved ASN. 180 | - `reserved_path_2`: the route after the change includes a reserved ASN. 181 | - `non_valley_free_1`: the route before the change is non-valley-free. 182 | - `non_valley_free_2`: the route after the change is non-valley-free. 183 | - `none_rel_1`: the route before the change includes unknown links. 184 | - `none_rel_2`: the route after the change includes unknown links. 185 | - `as_prepend_1`: the route before the change includes prepended ASes. 186 | - `as_prepend_2`: the route after the change includes prepended ASes.
187 | - `detour_country`: the alarm includes routes detouring through other countries. 188 | - `path1_in_path2`: the route before the change is a subset of the route after the change. 189 | - `path2_in_path1`: the route after the change is a subset of the route before the change. 190 | 191 | After the properties are associated, run `post_processor/summary_routeviews.py` to generate an HTML report about the month's detection results. See all available parameters with `--help`. 192 | 193 | An example run is as follows: 194 | 195 | ```bash 196 | python summary_routeviews.py \ 197 | --collector wide \ 198 | --year 2024 \ 199 | --month 8 200 | ``` 201 | 202 | This example generates an HTML report and a JSON-line-format file from the alarms generated in [Step 4](#4-detect-anomalies). 203 | 204 | Notes: 205 | 206 | - The HTML report is stored in `post_processor/html/`, and the JSON-line-format file is stored in `post_processor/summary_output/`. 207 | 208 | - The HTML report is self-contained, with necessary descriptions of the terms used. 209 | 210 | ## Future Work 211 | 212 | **Updated on Sep. 13, 2024**: 213 | 214 | A full-featured, open-source version of the anomaly detection system is under development, aimed at deployment in production environments such as ISPs, and potentially as a public service to monitor the Internet and issue BGP anomaly warnings. This includes plans to refactor key functions using the Rust programming language and package them as crates for public access. The current organization of these components is as follows: 215 | 216 | - **In progress:** A module to synchronize routing data in real time from RouteViews, RIPE RIS, and self-operated or peering ASes, stored locally in a database. 217 | - SQLite for local storage and management of routing data. 218 | - A subset of the functionality of BGPStream. 219 | - Integration of BGPdump. 220 | - KVM/Docker support for virtual routers. 221 | - **Pending:** A module to train BEAM models using the latest CAIDA AS relationship data. 222 | - **Pending:** A module to process real-time routing data and detect anomalies. 223 | - **Pending:** A website or app for displaying and analyzing detection results in real time.
224 | 225 | --- 226 | -------------------------------------------------------------------------------- /post_processor/summary_routeviews.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding: utf-8 -*- 3 | 4 | import json 5 | import pandas as pd 6 | import numpy as np 7 | from pathlib import Path 8 | from datetime import datetime 9 | import subprocess 10 | import calendar 11 | import click 12 | 13 | import sys 14 | sys.path.append(str(Path(__file__).resolve().parent.parent)) 15 | from anomaly_detector.utils import event_aggregate 16 | 17 | @click.command() 18 | @click.option("--collector", "-c", type=str, default="wide", help="the name of RouteView collector to generate the report") 19 | @click.option("--year", "-y", type=int, required=True, help="the year of the detection results, e.g., 2024") 20 | @click.option("--month", "-m", type=int, required=True, help="the month of the detection results, e.g., 8") 21 | def main(collector, year, month): 22 | repo_dir = Path(__file__).resolve().parent.parent 23 | collector_result_dir = repo_dir/"routing_monitor"/"detection_result"/collector 24 | reported_alarm_dir = collector_result_dir/"reported_alarms"/f"{year}{month:02d}" 25 | route_change_dir = collector_result_dir/"route_change" 26 | info = json.load(open(reported_alarm_dir/f"info_{year}{month:02}.json", "r")) 27 | flags_dir = reported_alarm_dir.parent/f"{year}{month:02d}.flags" 28 | 29 | summary_dir = Path(__file__).resolve().parent/"summary_output" 30 | summary_dir.mkdir(parents=True, exist_ok=True) 31 | 32 | html_dir = Path(__file__).resolve().parent/"html" 33 | 34 | metric = "diff_balance" 35 | 36 | def has_flag(v): 37 | return v != "-" 38 | v_flag = np.vectorize(has_flag) 39 | 40 | def invalid_asn(v): 41 | return v == "Invalid" 42 | v_invalid_asn = np.vectorize(invalid_asn) 43 | 44 | def invalid_len(v): 45 | return v == "invalid_length" 46 | v_invalid_len = np.vectorize(invalid_len) 47 | 48 | def valid(v): 49 | return v == "valid" 50 | v_valid = np.vectorize(valid) 51 | 52 | def summary(): 53 | global_group_id = 0 54 | dfs = [] 55 | for i in info: 56 | if i["save_path"] is None: continue 57 | df = pd.read_csv(i["save_path"]) 58 | flags = pd.read_csv(flags_dir/f"{Path(i['save_path']).stem}.flags.csv") 59 | df = pd.concat([df, flags], axis=1) 60 | 61 | # highly possible origin hijack 62 | anomaly_t1 = df["origin_change"] \ 63 | & (~v_flag(df["origin_same_org"])) \ 64 | & ((v_invalid_asn(df["origin_rpki_1"]) 65 | ^ v_invalid_asn(df["origin_rpki_2"])) 66 | | (v_invalid_asn(df["origin_irr_1"]) 67 | ^ v_invalid_asn(df["origin_irr_2"])) 68 | | (df["origin_whois_1"] ^ df["origin_whois_2"])) 69 | 70 | # highly possible route leak 71 | anomaly_t2 = v_flag(df["non_valley_free_1"]) \ 72 | | v_flag(df["non_valley_free_2"]) 73 | 74 | # highly possible path manipulation 75 | anomaly_t3 = v_flag(df["reserved_path_1"]) \ 76 | | v_flag(df["reserved_path_2"]) \ 77 | | v_flag(df["none_rel_1"]) \ 78 | | v_flag(df["none_rel_2"]) \ 79 | | np.isinf(df["diff"]) 80 | exception_t3 = (~v_flag(df["reserved_path_1"])) \ 81 | & (~v_flag(df["reserved_path_2"])) \ 82 | & (v_flag(df["none_rel_1"]) \ 83 | | v_flag(df["none_rel_2"])) 84 | 85 | # highly possible ROA/IRR/WHOIS misconfiguration 86 | anomaly_t4 = v_flag(df["origin_same_org"]) \ 87 | & ((v_invalid_asn(df["origin_rpki_1"]) 88 | ^ v_invalid_asn(df["origin_rpki_2"])) 89 | | (v_invalid_asn(df["origin_irr_1"]) 90 | ^ v_invalid_asn(df["origin_irr_2"])) 91 | | (df["origin_whois_1"] ^ 
df["origin_whois_2"])) 92 | 93 | # highly possible benign MOAS 94 | benign_t1 = df["origin_change"] \ 95 | & (v_flag(df["origin_same_org"]) 96 | | v_flag(df["origin_connection"]) 97 | | (v_valid(df["origin_rpki_1"]) 98 | & v_valid(df["origin_rpki_2"]))) 99 | 100 | # highly possible AS prepending 101 | benign_t2 = (~df["origin_change"]) \ 102 | & (has_flag(df["as_prepend_1"]) 103 | ^ has_flag(df["as_prepend_2"])) 104 | 105 | # highly possible multi-homing 106 | benign_t3 = v_flag(df["origin_different_upstream"]) 107 | 108 | # no any sign of anomaly 109 | benign_t4 = (~df["detour_country"]) \ 110 | & v_valid(df["origin_rpki_1"]) \ 111 | & v_valid(df["origin_rpki_2"]) 112 | 113 | # possible false alarms due to the nature of diff computation 114 | benign_t5 = (df["path_l1"]+df["path_l2"])/2 <= 3 115 | 116 | # possible prefix transfer 117 | benign_t6 = df["path1_in_path2"] 118 | 119 | 120 | df["a1"] = anomaly_t1 121 | df["a2"] = anomaly_t2 122 | df["a3"] = anomaly_t3 123 | df["a4"] = anomaly_t4 124 | df["b1"] = benign_t1 125 | df["b2"] = benign_t2 126 | df["b3"] = benign_t3 127 | df["b4"] = benign_t4 128 | df["b5"] = benign_t5 129 | df["b6"] = benign_t6 130 | 131 | anomaly = anomaly_t1 | anomaly_t2 | anomaly_t3 | anomaly_t4 132 | benign = benign_t1 | benign_t2 | benign_t3 | benign_t4 | benign_t5 | benign_t6 133 | 134 | df["pattern"] = "unknown" 135 | df.loc[benign, ["pattern"]] = "benign" 136 | df.loc[anomaly, ["pattern"]] = "anomaly" 137 | 138 | df = df.loc[anomaly | (~benign)] # post-filtering 139 | df = df.loc[(~exception_t3)|anomaly_t1|anomaly_t2|anomaly_t4] 140 | 141 | event_key = i["event_key"] 142 | forwarder_th = i["forwarder_th"] 143 | 144 | events = {} 145 | for key,ev in df.groupby(event_key): # re-grouping and filtering 146 | if ev.shape[0] <= forwarder_th: continue 147 | events[key] = ev 148 | 149 | if events: 150 | _, df = event_aggregate(events) 151 | n_alarms = len(df["group_id"].unique()) 152 | assert np.max(df["group_id"]) == n_alarms-1, f"{np.max(df['group_id'])}, {n_alarms-1}" 153 | df["group_id"] += global_group_id 154 | global_group_id += n_alarms 155 | dfs.append(df) 156 | 157 | df = pd.concat(dfs) 158 | df.to_csv(summary_dir/f"alarms_after_post_process_{collector}_{year}{month:02}.csv", index=False) 159 | return df 160 | 161 | df = summary() 162 | 163 | def reason(tag, row): 164 | if tag == "a1": 165 | fields = ["origin_rpki_1", "origin_rpki_2", "origin_irr_1", "origin_irr_2", "origin_whois_1", "origin_whois_2"] 166 | elif tag == "a2": 167 | fields = ["non_valley_free_1", "non_valley_free_2"] 168 | elif tag == "a3": 169 | fields = ["reserved_path_1", "reserved_path_2", 170 | "none_rel_1", "none_rel_2", "unknown_asn_1", "unknown_asn_2"] 171 | elif tag == "a4": 172 | fields = ["origin_same_org", "origin_rpki_1", "origin_rpki_2", "origin_irr_1", "origin_irr_2", "origin_whois_1", "origin_whois_2"] 173 | elif tag == "b1": 174 | fields = ["origin_same_org", "origin_connection", 175 | "origin_rpki_1", "origin_rpki_2"] 176 | elif tag == "b2": 177 | fields = ["as_prepend_1", "as_prepend_2"] 178 | elif tag == "b3": 179 | fields = ["origin_different_upstream"] 180 | elif tag == "b4": 181 | fields = ["origin_rpki_1", "origin_rpki_2"] 182 | elif tag == "b5": 183 | fields = [] 184 | 185 | r = {i: str(row[i]) for i in fields if has_flag(row[i])} 186 | return r 187 | 188 | def terminal_checkout(group_id, group): 189 | tags = ["a1", "a2", "a3", "a4", "b1", "b2", "b3", "b4", "b5"] 190 | 191 | print(f"alarm_id: {group_id}") 192 | for prefix_key, ev in group.groupby(["prefix1", "prefix2"]): 
193 | print(f"* {' -> '.join(prefix_key)}") 194 | for _, row in ev.iterrows(): 195 | print(f" path1: {row['path1']}") 196 | print(f" path2: {row['path2']}") 197 | print(f" diff={row[metric]}") 198 | print(f" culprit={row['culprit']}") 199 | for k,v in zip(tags, row[tags]): 200 | if v: 201 | r = reason(k, row) 202 | print(f"{k}: ", end="") 203 | print(",".join([f"{x}={y}" for x,y in r.items()])) 204 | print() 205 | input("..Enter to next") 206 | 207 | def json_checkout(group_id, group): 208 | tags = ["a1", "a2", "a3", "a4", "b1", "b2", "b3", "b4", "b5"] 209 | 210 | timestamp = group["timestamp"].values 211 | fmt = "%a %d %b %Y, %I:%M%p" 212 | start_time = datetime.fromtimestamp(timestamp.min()).strftime(fmt) 213 | end_time = datetime.fromtimestamp(timestamp.max()).strftime(fmt) 214 | 215 | events = [] 216 | for prefix_key, ev in group.groupby(["prefix1", "prefix2"]): 217 | route_changes = [] 218 | for _, row in ev.iterrows(): 219 | route_changes.append({ 220 | "timestamp": int(row["timestamp"]), 221 | "path1": str(row["path1"]), 222 | "path2": str(row["path2"]), 223 | "diff": float(row[metric]), 224 | "culprit": json.loads(str(row['culprit'])), 225 | "patterns": {k: reason(k, row) for k,v in zip(tags, row[tags]) if v}, 226 | }) 227 | 228 | events.append({ 229 | "prefix": prefix_key, 230 | "route_changes": route_changes 231 | }) 232 | 233 | ret = { 234 | "group_id": group_id, 235 | "start_time": start_time, 236 | "end_time": end_time, 237 | "events": events, 238 | } 239 | # print(json.dumps(ret, indent=2)) 240 | return ret 241 | 242 | def group_html_checkout(group_id, group): 243 | tags = ["a1", "a2", "a3", "a4", "b1", "b2", "b3", "b4", "b5"] 244 | 245 | timestamp = group["timestamp"].values 246 | fmt = "%Y/%m/%d %H:%M:%S" 247 | start_time = datetime.fromtimestamp(timestamp.min()).strftime(fmt) 248 | end_time = datetime.fromtimestamp(timestamp.max()).strftime(fmt) 249 | 250 | def text_color(s, color): 251 | return f'{s}' 252 | 253 | events = [] 254 | for prefix_key, ev in group.groupby(["prefix1", "prefix2"]): 255 | route_changes = [] 256 | for _, row in ev.iterrows(): 257 | timestamp = f"

timestamp: {row['timestamp']}

" 258 | path1 = f"

path1: {row['path1']}

" 259 | path2 = f"

path2: {row['path2']}

" 260 | diff = f"

diff: {row[metric]}

" 261 | culprit = f"

culprit: {row['culprit']}

" 262 | 263 | patterns = [] 264 | for k,v in zip(tags, row[tags]): 265 | if v: 266 | r = reason(k, row) 267 | p = f"

{k}: "+",".join([f"{x}={y}" for x,y in r.items()])+"

" 268 | patterns.append(p) 269 | pattern_part = "

patterns: " 270 | if not patterns: 271 | pattern_part += "none" 272 | pattern_part += "

" 273 | 274 | rc_html = "
  • \n" 275 | rc_html+= " "+timestamp+"\n" 276 | rc_html+= " "+path1+"\n" 277 | rc_html+= " "+path2+"\n" 278 | rc_html+= " "+diff+"\n" 279 | rc_html+= " "+culprit+"\n" 280 | rc_html+= " "+pattern_part+"\n" 281 | if patterns: rc_html+= " \n" 282 | rc_html+= "
  • " 283 | 284 | route_changes.append(rc_html) 285 | 286 | 287 | c = "MediumSeaGreen" if (ev["pattern"] == "anomaly").any() else "Orange" 288 | p0, p1 = prefix_key 289 | prefix_title = f'

    {text_color(p0,c)} -> {text_color(p1,c)}

    ' 290 | route_change_part = "\n".join(route_changes) 291 | 292 | ev_html = "
  • \n" 293 | ev_html+= " "+prefix_title+"\n" 294 | ev_html+= " \n" 297 | ev_html+= "
  • \n" 298 | 299 | events.append(ev_html) 300 | 301 | mark = text_color("✔", "MediumSeaGreen") \ 302 | if (group["pattern"] == "anomaly").any() else \ 303 | text_color('✗', "Orange") 304 | group_title = f"{mark} id: {group_id}, start: {start_time}, end: {end_time}, events: {len(events)}, route_changes: {group.shape[0]}" 305 | events_part = "".join(events) 306 | 307 | html = f'\n' 308 | html+= '
    \n' 309 | html+= '\n' 312 | html+= '
    \n' 313 | 314 | return html 315 | 316 | def gen_jsonl(): 317 | anomaly_cnt = 0 318 | lines = [] 319 | for group_id, group in df.groupby("group_id"): 320 | if (group["pattern"] == "anomaly").any(): 321 | anomaly_cnt += 1 322 | jl = json_checkout(group_id, group) 323 | lines.append(json.dumps(jl)) 324 | 325 | with open(summary_dir/f"alarms_{collector}_{year}{month:02}.jsonl", "w") as f: 326 | f.write("\n".join(lines)+"\n") 327 | 328 | print(f"total groups: {group_id+1}") 329 | print(f"anomaly: {anomaly_cnt}") 330 | 331 | def terminal_display(): 332 | for group_id, group in df.groupby("group_id"): 333 | terminal_checkout(group_id, group) 334 | 335 | def stats_checkout(df): 336 | daily_cnts = np.zeros(calendar.monthrange(year, int(month))[1], dtype=int) 337 | daily_cnts_a = daily_cnts.copy() 338 | 339 | days = [datetime.fromtimestamp(np.min(g["timestamp"])).day for _,g in df.groupby("group_id")] 340 | days_a = [datetime.fromtimestamp(np.min(g["timestamp"])).day for _,g in df.groupby("group_id") 341 | if (g["pattern"] == "anomaly").any()] 342 | 343 | days, cnts = np.unique(days, return_counts=True) 344 | daily_cnts[days-1] = cnts 345 | 346 | days_a, cnts_a = np.unique(days_a, return_counts=True) 347 | daily_cnts_a[days_a-1] = cnts_a 348 | 349 | route_change_cnts = int(subprocess.run(f"wc -l {route_change_dir}/{year}{month:02}*.csv", shell=True, 350 | stdout=subprocess.PIPE, encoding='UTF-8').stdout.strip().split()[-2]) 351 | 352 | return daily_cnts, daily_cnts_a, route_change_cnts 353 | 354 | def gen_html(): 355 | sections = [] 356 | for group_id, group in df.groupby("group_id"): 357 | html = group_html_checkout(group_id, group) 358 | sections.append(html) 359 | template = open(html_dir/"template_routeviews.html", "r").read() 360 | html = template.replace("REPLACE_WITH_SECTIONS", "\n".join(sections)) 361 | html = html.replace("REPLACE_WITH_TITLE", f"{year}-{month} Report(RouteViews {collector})") 362 | 363 | 364 | daily_cnts, daily_cnts_a, route_change_cnts = stats_checkout(df) 365 | xvalues = "["+", ".join([f"{i+1:02}" for i in range(calendar.monthrange(year, int(month))[1])])+"]" 366 | yvalues_a = "["+", ".join([f"{i+1:02}" for i in daily_cnts_a])+"]" 367 | yvalues_b = "["+", ".join([f"{i+1:02}" for i in daily_cnts-daily_cnts_a])+"]" 368 | 369 | html = html.replace("REPLACE_WITH_XVALUES", xvalues) 370 | html = html.replace("REPLACE_WITH_YVALUES_A", yvalues_a) 371 | html = html.replace("REPLACE_WITH_YVALUES_B", yvalues_b) 372 | 373 | exp = "\n" 376 | 377 | html = html.replace("REPLACE_WITH_EXPLANATION", exp) 378 | 379 | open(html_dir/f"report_{collector}_{year}{month:02}.html", "w").write(html) 380 | 381 | gen_jsonl() 382 | # terminal_display() 383 | gen_html() 384 | 385 | if __name__ == "__main__": 386 | main() 387 | --------------------------------------------------------------------------------