├── wayback2csv
│   ├── __init__.py
│   └── wayback2csv.py
├── requirements.txt
├── .gitignore
├── examples
│   ├── costco_couch_price.py
│   ├── piedmont_atlanta_er_wait_times.py
│   └── twitter_followers_over_time.py
├── setup.py
└── README.md

--------------------------------------------------------------------------------
/wayback2csv/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
waybackpack>=0.4
beautifulsoup4
tqdm
lxml

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.csv
.DS_Store
.ipynb_checkpoints/
.wayback2csv/
followers over time.ipynb
wayback2csv.code-workspace
gettr_followers_over_time.py
twitter_followers_over_time.py
gab_followers_over_time.py
*.pyc
__pycache__/
data/
*.ipynb

--------------------------------------------------------------------------------
/examples/costco_couch_price.py:
--------------------------------------------------------------------------------
from wayback2csv.wayback2csv import Wayback2Csv

w2c = Wayback2Csv("wayback2csv YOUR EMAIL ADDRESS HERE", "http://www.costco.com/.product.100018645.html", from_date="2015")
w2c.download()
w2c.parse_html(".your-price .currency", lambda x: x.text.replace("$", '').replace(",", ''))
w2c.to_csv("costco_couch_price_over_time.csv", ["Costco Couch"])

--------------------------------------------------------------------------------
/examples/piedmont_atlanta_er_wait_times.py:
--------------------------------------------------------------------------------
from wayback2csv.wayback2csv import Wayback2Csv

w2c = Wayback2Csv("wayback2csv YOUR EMAIL ADDRESS HERE", "https://www.piedmont.org/emergency-room-wait-times/emergency-room-wait-times", from_date="2015")
w2c.download()
w2c.parse_html("#ctl00_cphContent_ctl00_lblPiedmontAtlanta", lambda x: x.text)
w2c.to_csv("piedmont_atlanta_er_wait_times_over_time.csv", ["Atlanta"])

--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup

setup(name='wayback2csv',
      version='0.1',
      description='get time series data from archived versions of a page on the Wayback Machine',
      author='Jeremy B. Merrill',
      author_email='jeremy@jeremybmerrill.com',
      url='https://www.github.com/jeremybmerrill/wayback2csv/',
      packages=['wayback2csv'],
      install_requires=[
          'waybackpack>=0.4',
          'beautifulsoup4',
          'tqdm',
          'lxml'
      ],
      )

--------------------------------------------------------------------------------
/examples/twitter_followers_over_time.py:
--------------------------------------------------------------------------------
from wayback2csv.wayback2csv import Wayback2Csv

from sys import argv
import json

username = argv[1]

w2c = Wayback2Csv("wayback2csv YOUR EMAIL ADDRESS HERE", f"twitter.com/{username}", from_date="2022")
w2c.download()

# You can use multiple parsing strategies if files differ in format!

# Some older (2022) Twitter snapshot files include the follower count in this div.
w2c.parse_html(".ProfileNav-item--followers .ProfileNav-value", lambda x: x.get("data-count") if x.get("data-count") else "None")


# Some newer (2023) Twitter snapshot files instead include the follower count in JSON, which is extracted here.
def parse_json(json_el):
    try:
        return json.loads(json_el.text)["author"]["interactionStatistic"][0]["userInteractionCount"]
    except (json.JSONDecodeError, KeyError, IndexError, TypeError):
        # Print the unexpected payload for debugging, then skip this snapshot.
        print(json_el.text)
        return 'None'

w2c.parse_html_xpath("//script[contains(text(), 'userInteractionCount')]", parse_json)
w2c.to_csv(f"{username}_twitter_followers_over_time.csv", [username])

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# wayback2csv

A module that fetches archived versions of a URL from the Internet Archive's Wayback Machine, parses a specific data point out of each snapshot, and writes the values to a CSV along with each snapshot's date.

For example, you can use it to reconstruct an account's follower count over time from archived copies of its profile page.

e.g.

```
from wayback2csv.wayback2csv import Wayback2Csv
from sys import argv

username = argv[1]

w2c = Wayback2Csv("wayback2csv wayback2csv@example.com", f"twitter.com/{username}", from_date="2022")
w2c.download()
w2c.parse_html(".ProfileNav-item--followers .ProfileNav-value", lambda x: x.get("data-count") if x.get("data-count") else "None")
w2c.to_csv(f"data/{username}_twitter_followers_over_time.csv", [username])
```

If you need to parse a file multiple ways (as with Twitter, whose page format has changed over the years), just call `parse_html` multiple times. As long as a strategy finds no matching element in a snapshot (or extracts something that can't be parsed as a number), that snapshot is simply skipped by that strategy.

```
from wayback2csv.wayback2csv import Wayback2Csv
from sys import argv
import json

username = argv[1]

w2c = Wayback2Csv("wayback2csv wayback2csv@example.com", f"twitter.com/{username}", from_date="2022")
w2c.download()
w2c.parse_html(".ProfileNav-item--followers .ProfileNav-value", lambda x: x.get("data-count") if x.get("data-count") else "None")
w2c.parse_html("script[data-rh=true]", lambda x: [i for i in json.loads(x.text)["author"]["interactionStatistic"] if i["name"] == "Follows"][0]["userInteractionCount"])  # just call it twice!
w2c.to_csv(f"data/{username}_twitter_followers_over_time.csv", [username])
```

## Install instructions

1. git clone the repo and cd into the dir
2. `pip install -e .`
3. go to town!
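## Other parsing methods

Besides `parse_html`, the library also exposes `parse_html_xpath` (XPath selectors, parsed with lxml, for when a CSS selector isn't expressive enough) and `parse_json` (for snapshots that are JSON documents; it walks a sequence of keys/indexes down into the parsed data). Here is a minimal sketch of how they might be called — the URLs, selector, and JSON shape below are made up for illustration:

```
from wayback2csv.wayback2csv import Wayback2Csv

# XPath variant: like parse_html, but with an XPath selector instead of a CSS selector.
w2c = Wayback2Csv("wayback2csv wayback2csv@example.com", "https://example.com/stats", from_date="2020")
w2c.download()
w2c.parse_html_xpath("//span[@id='follower-count']", lambda el: el.text)
w2c.to_csv("example_stats_over_time.csv")

# JSON variant: pass the path of keys/indexes leading to the value you want
# (this assumes a hypothetical {"stats": {"followers": ...}} response shape).
w2c_json = Wayback2Csv("wayback2csv wayback2csv@example.com", "https://example.com/api/stats.json", from_date="2020")
w2c_json.download()
w2c_json.parse_json(["stats", "followers"])
w2c_json.to_csv("example_followers_over_time.csv")
```

All of the parse methods append to the same internal list of values, so you can mix `parse_html`, `parse_html_xpath`, and `parse_json` calls on one `Wayback2Csv` object before a single `to_csv`.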

--------------------------------------------------------------------------------
/wayback2csv/wayback2csv.py:
--------------------------------------------------------------------------------
from operator import itemgetter
import os
from datetime import datetime
import csv
import json
import logging

import waybackpack
from bs4 import BeautifulSoup
from lxml import html
from lxml.etree import ParserError as LxmlParserError

logger = logging.getLogger(__name__)


DEFAULT_PARENT_DIR = ".wayback2csv/"
DEFAULT_COLLAPSE = "timestamp:8"  # collapse on YYYYMMDD, i.e. at most one snapshot per day


class Wayback2Csv:
    """Download Wayback Machine snapshots of a URL and extract one value per snapshot."""

    def __init__(self, user_agent, url, from_date=None, to_date=None, collapse=DEFAULT_COLLAPSE, dir=DEFAULT_PARENT_DIR):
        session = waybackpack.Session(user_agent=user_agent)
        self.dir = dir

        snapshots = waybackpack.search(url,
                                       session=session,
                                       from_date=from_date,
                                       to_date=to_date,
                                       uniques_only=True,
                                       collapse=collapse
                                       )

        timestamps = [snap["timestamp"] for snap in snapshots]

        self.pack = waybackpack.Pack(
            url,
            timestamps=timestamps,
            session=session
        )
        self.values = []

    def download(self, ignore_errors=True):
        """Download every snapshot into self.dir; files that already exist are not re-downloaded."""
        self.pack.download_to(
            self.dir,
            no_clobber=True,
            progress=True,
            ignore_errors=ignore_errors
        )

    def pack_files(self):
        """Yield the local path of each snapshot file that was actually downloaded."""
        for asset in self.pack.assets:
            # copy-pasted from waybackpack/asset.py
            path_head, path_tail = os.path.split(self.pack.parsed_url.path)
            if path_tail == "":
                path_tail = "index.html"

            filedir = os.path.join(
                self.dir,
                asset.timestamp,
                self.pack.parsed_url.netloc,
                path_head.lstrip("/")
            )

            fn = os.path.join(filedir, path_tail)
            if os.path.exists(fn):
                yield fn

    def parse_html_xpath(self, xpath_selector, number_lambda=None):
        """Extract one value per snapshot using an XPath selector (parsed with lxml).

        number_lambda, if given, maps the first matching element to a value; snapshots
        whose value can't be parsed as a float are skipped.
        """
        for fn in self.pack_files():
            with open(fn, 'r', errors="ignore") as f:
                try:
                    html_doc = f.read()
                except UnicodeDecodeError:
                    continue
            try:
                tree = html.fromstring(html_doc)
            except LxmlParserError:
                continue
            followers_values = tree.xpath(xpath_selector)

            if not followers_values:
                logger.warning(
                    "couldn't find any elements matching %s in %s", xpath_selector, fn)
                continue
            followers = number_lambda(
                followers_values[0]) if number_lambda else followers_values[0]
            try:
                count = float(followers)
            except ValueError:
                logger.warning(
                    "couldn't parse a float from %s in %s", followers, fn)
                continue
            # the snapshot timestamp is the second path component (assumes self.dir is a single directory)
            raw_date = fn.split("/")[1]
            scrape_date = datetime.strptime(raw_date[:8], "%Y%m%d")
            self.values.append([fn, scrape_date, count])

    def parse_html(self, css_selector, number_lambda=None):
        """Extract one value per snapshot using a CSS selector (parsed with BeautifulSoup).

        number_lambda, if given, maps the first matching element to a value; snapshots
        whose value can't be parsed as a float are skipped.
        """
        for fn in self.pack_files():
            with open(fn, 'r', errors="ignore") as f:
                try:
                    html_doc = f.read()
                except UnicodeDecodeError:
                    continue
            soup = BeautifulSoup(html_doc, 'html.parser')

            followers_values = soup.select(css_selector)
            if not followers_values:
                logger.warning(
                    "couldn't find any elements matching %s in %s", css_selector, fn)
                continue
            followers = number_lambda(
                followers_values[0]) if number_lambda else followers_values[0]
            try:
                count = float(followers)
            except ValueError:
                logger.warning(
                    "couldn't parse a float from %s in %s", followers, fn)
                continue
            raw_date = fn.split("/")[1]
            scrape_date = datetime.strptime(raw_date[:8], "%Y%m%d")
            self.values.append([fn, scrape_date, count])

    def parse_json(self, path, number_lambda=None):
        """Extract one value per (JSON) snapshot by following `path`, a sequence of keys/indexes."""
        for fn in self.pack_files():
            with open(fn, 'r') as f:
                try:
                    json_doc = f.read()
                except UnicodeDecodeError:
                    continue
            try:
                data = json.loads(json_doc)
            except json.decoder.JSONDecodeError:
                logger.error("JSON decode error %s", fn)
                continue
            count = drill_down_nested_dict(data, path)
            count = number_lambda(count) if number_lambda else count
            raw_date = fn.split("/")[1]
            scrape_date = datetime.strptime(raw_date[:8], "%Y%m%d")
            self.values.append([fn, scrape_date, count])

    def to_csv(self, outfn, extra_row_values=[]):
        """Write the collected values to `outfn`, sorted by snapshot date.

        extra_row_values are constant columns appended to every row (e.g. an account name).
        """
        with open(outfn, 'w', newline='') as outf:
            writer = csv.writer(outf)
            writer.writerow(["path", "date", "value"] +
                            [f"extra{n+1}" for n, _ in enumerate(extra_row_values)])
            for row in sorted(self.values, key=itemgetter(1)):
                writer.writerow(row + extra_row_values)


def drill_down_nested_dict(nested_dict, keys):
    """Follow a sequence of keys/indexes into nested dicts/lists and return the final value."""
    for key in keys:
        nested_dict = nested_dict[key]
    return nested_dict
--------------------------------------------------------------------------------