├── wayback2csv
│   ├── __init__.py
│   └── wayback2csv.py
├── requirements.txt
├── .gitignore
├── examples
│   ├── costco_couch_price.py
│   ├── piedmont_atlanta_er_wait_times.py
│   └── twitter_followers_over_time.py
├── setup.py
└── README.md

--------------------------------------------------------------------------------
/wayback2csv/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
waybackpack>=0.4
beautifulsoup4
tqdm
lxml

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.csv
.DS_Store
.ipynb_checkpoints/
.wayback2csv/
followers over time.ipynb
wayback2csv.code-workspace
gettr_followers_over_time.py
twitter_followers_over_time.py
gab_followers_over_time.py
*.pyc
__pycache__/
data/
*.ipynb

--------------------------------------------------------------------------------
/examples/costco_couch_price.py:
--------------------------------------------------------------------------------
from wayback2csv.wayback2csv import Wayback2Csv

w2c = Wayback2Csv("wayback2csv YOUR EMAIL ADDRESS HERE", "http://www.costco.com/.product.100018645.html", from_date="2015")
w2c.download()
w2c.parse_html(".your-price .currency", lambda x: x.text.replace("$", '').replace(",", ''))
w2c.to_csv("costco_couch_price_over_time.csv", ["Costco Couch"])

--------------------------------------------------------------------------------
/examples/piedmont_atlanta_er_wait_times.py:
--------------------------------------------------------------------------------
from wayback2csv.wayback2csv import Wayback2Csv

w2c = Wayback2Csv("wayback2csv YOUR EMAIL ADDRESS HERE", "https://www.piedmont.org/emergency-room-wait-times/emergency-room-wait-times", from_date="2015")
w2c.download()
w2c.parse_html("#ctl00_cphContent_ctl00_lblPiedmontAtlanta", lambda x: x.text)
w2c.to_csv("piedmont_atlanta_er_wait_times_over_time.csv", ["Atlanta"])

--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup

setup(name='wayback2csv',
      version='0.1',
      description='get time series data from archived versions of a page on the Wayback Machine',
      author='Jeremy B. Merrill',
      author_email='jeremy@jeremybmerrill.com',
      url='https://www.github.com/jeremybmerrill/wayback2csv/',
      packages=['wayback2csv'],
      install_requires=[
          'waybackpack>=0.4',
          'beautifulsoup4',
          'tqdm',
          'lxml'
      ],
      )

--------------------------------------------------------------------------------
/examples/twitter_followers_over_time.py:
--------------------------------------------------------------------------------
from wayback2csv.wayback2csv import Wayback2Csv

from sys import argv
import json

username = argv[1]

w2c = Wayback2Csv("wayback2csv YOUR EMAIL ADDRESS HERE", f"twitter.com/{username}", from_date="2022")
w2c.download()

# You can use multiple parsing strategies if files differ in format!

# Some older (2022) Twitter snapshot files include the follower count in this div.
w2c.parse_html(".ProfileNav-item--followers .ProfileNav-value", lambda x: x.get("data-count") if x.get("data-count") else "None")


# Some newer (2023) Twitter snapshot files instead include the follower count in JSON, which is extracted here.
def parse_json(json_el):
    try:
        return json.loads(json_el.text)["author"]["interactionStatistic"][0]["userInteractionCount"]
    except (json.JSONDecodeError, KeyError, IndexError, TypeError):
        # Print the unexpected payload for debugging, then skip this snapshot.
        print(json_el.text)
        return 'None'

w2c.parse_html_xpath("//script[contains(text(), 'userInteractionCount')]", parse_json)
w2c.to_csv(f"{username}_twitter_followers_over_time.csv", [username])

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# wayback2csv

A module that fetches archived versions of a URL from the Internet Archive's Wayback Machine, parses a specific data point out of each snapshot, and writes the values to a CSV along with each snapshot's date.

For example, you can use it to reconstruct an account's follower count over time from archived copies of its profile page.

e.g.

```
from wayback2csv.wayback2csv import Wayback2Csv
from sys import argv

username = argv[1]

w2c = Wayback2Csv("wayback2csv wayback2csv@example.com", f"twitter.com/{username}", from_date="2022")
w2c.download()
w2c.parse_html(".ProfileNav-item--followers .ProfileNav-value", lambda x: x.get("data-count") if x.get("data-count") else "None")
w2c.to_csv(f"data/{username}_twitter_followers_over_time.csv", [username])
```

If you need to parse a file multiple ways (as with Twitter, whose page format has changed over the years), just call `parse_html` multiple times. As long as a strategy finds no matching element in a snapshot (or extracts something that can't be parsed as a number), that snapshot is simply skipped by that strategy.

```
from wayback2csv.wayback2csv import Wayback2Csv
from sys import argv
import json

username = argv[1]

w2c = Wayback2Csv("wayback2csv wayback2csv@example.com", f"twitter.com/{username}", from_date="2022")
w2c.download()
w2c.parse_html(".ProfileNav-item--followers .ProfileNav-value", lambda x: x.get("data-count") if x.get("data-count") else "None")
w2c.parse_html("script[data-rh=true]", lambda x: [i for i in json.loads(x.text)["author"]["interactionStatistic"] if i["name"] == "Follows"][0]["userInteractionCount"])  # just call it twice!
w2c.to_csv(f"data/{username}_twitter_followers_over_time.csv", [username])
```

## Install instructions

1. git clone the repo and cd into the dir
2. `pip install -e .`
3. go to town!
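## Other parsing methods

Besides `parse_html`, the library also exposes `parse_html_xpath` (XPath selectors, parsed with lxml, for when a CSS selector isn't expressive enough) and `parse_json` (for snapshots that are JSON documents; it walks a sequence of keys/indexes down into the parsed data). Here is a minimal sketch of how they might be called — the URLs, selector, and JSON shape below are made up for illustration:

```
from wayback2csv.wayback2csv import Wayback2Csv

# XPath variant: like parse_html, but with an XPath selector instead of a CSS selector.
w2c = Wayback2Csv("wayback2csv wayback2csv@example.com", "https://example.com/stats", from_date="2020")
w2c.download()
w2c.parse_html_xpath("//span[@id='follower-count']", lambda el: el.text)
w2c.to_csv("example_stats_over_time.csv")

# JSON variant: pass the path of keys/indexes leading to the value you want
# (this assumes a hypothetical {"stats": {"followers": ...}} response shape).
w2c_json = Wayback2Csv("wayback2csv wayback2csv@example.com", "https://example.com/api/stats.json", from_date="2020")
w2c_json.download()
w2c_json.parse_json(["stats", "followers"])
w2c_json.to_csv("example_followers_over_time.csv")
```

All of the parse methods append to the same internal list of values, so you can mix `parse_html`, `parse_html_xpath`, and `parse_json` calls on one `Wayback2Csv` object before a single `to_csv`.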

--------------------------------------------------------------------------------
/wayback2csv/wayback2csv.py:
--------------------------------------------------------------------------------
from operator import itemgetter
import os
from datetime import datetime
import csv
import json
import logging

import waybackpack
from bs4 import BeautifulSoup
from lxml import html
from lxml.etree import ParserError as LxmlParserError

logger = logging.getLogger(__name__)


DEFAULT_PARENT_DIR = ".wayback2csv/"
DEFAULT_COLLAPSE = "timestamp:8"  # collapse on YYYYMMDD, i.e. at most one snapshot per day


class Wayback2Csv:
    """Download Wayback Machine snapshots of a URL and extract one value per snapshot."""

    def __init__(self, user_agent, url, from_date=None, to_date=None, collapse=DEFAULT_COLLAPSE, dir=DEFAULT_PARENT_DIR):
        session = waybackpack.Session(user_agent=user_agent)
        self.dir = dir

        snapshots = waybackpack.search(url,
                                       session=session,
                                       from_date=from_date,
                                       to_date=to_date,
                                       uniques_only=True,
                                       collapse=collapse
                                       )

        timestamps = [snap["timestamp"] for snap in snapshots]

        self.pack = waybackpack.Pack(
            url,
            timestamps=timestamps,
            session=session
        )
        self.values = []

    def download(self, ignore_errors=True):
        """Download every snapshot into self.dir; files that already exist are not re-downloaded."""
        self.pack.download_to(
            self.dir,
            no_clobber=True,
            progress=True,
            ignore_errors=ignore_errors
        )

    def pack_files(self):
        """Yield the local path of each snapshot file that was actually downloaded."""
        for asset in self.pack.assets:
            # copy-pasted from waybackpack/asset.py
            path_head, path_tail = os.path.split(self.pack.parsed_url.path)
            if path_tail == "":
                path_tail = "index.html"

            filedir = os.path.join(
                self.dir,
                asset.timestamp,
                self.pack.parsed_url.netloc,
                path_head.lstrip("/")
            )

            fn = os.path.join(filedir, path_tail)
            if os.path.exists(fn):
                yield fn

    def parse_html_xpath(self, xpath_selector, number_lambda=None):
        """Extract one value per snapshot using an XPath selector (parsed with lxml).

        number_lambda, if given, maps the first matching element to a value; snapshots
        whose value can't be parsed as a float are skipped.
        """
        for fn in self.pack_files():
            with open(fn, 'r', errors="ignore") as f:
                try:
                    html_doc = f.read()
                except UnicodeDecodeError:
                    continue
            try:
                tree = html.fromstring(html_doc)
            except LxmlParserError:
                continue
            followers_values = tree.xpath(xpath_selector)

            if not followers_values:
                logger.warning(
                    "couldn't find any elements matching %s in %s", xpath_selector, fn)
                continue
            followers = number_lambda(
                followers_values[0]) if number_lambda else followers_values[0]
            try:
                count = float(followers)
            except ValueError:
                logger.warning(
                    "couldn't parse a float from %s in %s", followers, fn)
                continue
            # the snapshot timestamp is the second path component (assumes self.dir is a single directory)
            raw_date = fn.split("/")[1]
            scrape_date = datetime.strptime(raw_date[:8], "%Y%m%d")
            self.values.append([fn, scrape_date, count])

    def parse_html(self, css_selector, number_lambda=None):
        """Extract one value per snapshot using a CSS selector (parsed with BeautifulSoup).

        number_lambda, if given, maps the first matching element to a value; snapshots
        whose value can't be parsed as a float are skipped.
        """
        for fn in self.pack_files():
            with open(fn, 'r', errors="ignore") as f:
                try:
                    html_doc = f.read()
                except UnicodeDecodeError:
                    continue
            soup = BeautifulSoup(html_doc, 'html.parser')

            followers_values = soup.select(css_selector)
            if not followers_values:
                logger.warning(
                    "couldn't find any elements matching %s in %s", css_selector, fn)
                continue
            followers = number_lambda(
                followers_values[0]) if number_lambda else followers_values[0]
            try:
                count = float(followers)
            except ValueError:
                logger.warning(
                    "couldn't parse a float from %s in %s", followers, fn)
                continue
            raw_date = fn.split("/")[1]
            scrape_date = datetime.strptime(raw_date[:8], "%Y%m%d")
            self.values.append([fn, scrape_date, count])

    def parse_json(self, path, number_lambda=None):
        """Extract one value per (JSON) snapshot by following `path`, a sequence of keys/indexes."""
        for fn in self.pack_files():
            with open(fn, 'r') as f:
                try:
                    json_doc = f.read()
                except UnicodeDecodeError:
                    continue
            try:
                data = json.loads(json_doc)
            except json.decoder.JSONDecodeError:
                logger.error("JSON decode error %s", fn)
                continue
            count = drill_down_nested_dict(data, path)
            count = number_lambda(count) if number_lambda else count
            raw_date = fn.split("/")[1]
            scrape_date = datetime.strptime(raw_date[:8], "%Y%m%d")
            self.values.append([fn, scrape_date, count])

    def to_csv(self, outfn, extra_row_values=[]):
        """Write the collected values to `outfn`, sorted by snapshot date.

        extra_row_values are constant columns appended to every row (e.g. an account name).
        """
        with open(outfn, 'w', newline='') as outf:
            writer = csv.writer(outf)
            writer.writerow(["path", "date", "value"] +
                            [f"extra{n+1}" for n, _ in enumerate(extra_row_values)])
            for row in sorted(self.values, key=itemgetter(1)):
                writer.writerow(row + extra_row_values)


def drill_down_nested_dict(nested_dict, keys):
    """Follow a sequence of keys/indexes into nested dicts/lists and return the final value."""
    for key in keys:
        nested_dict = nested_dict[key]
    return nested_dict
--------------------------------------------------------------------------------