├── requirements.txt
├── README.md
├── LICENSE
├── fetch_data.py
├── .gitignore
└── collect_htmls.py

/requirements.txt:
--------------------------------------------------------------------------------
tenacity
beautifulsoup4
requests
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# chyrons_fetcher

A simple Python script that downloads the "Third Eye Data: TV News Archive chyrons" dataset from the Internet Archive.

## Example usage

Fetch chyrons from 2018/07/01 to 2018/08/31:

```python
import asyncio
from datetime import date

from fetch_data import main

if __name__ == "__main__":
    try:
        LOOP = asyncio.get_event_loop()
        LOOP.run_until_complete(main(date(2018, 7, 1), date(2018, 8, 31)))
    finally:
        LOOP.close()
```

Downloaded files will be in the `data/` subfolder.
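
## Reading the downloaded files

The downloaded files are plain tab-separated values, so the standard library's `csv` module is enough to inspect them. A minimal sketch (the column layout is not documented in this repo, so peek at a few rows first; the filename below is just an example of what the script produces):

```python
import csv
from pathlib import Path

# Any file previously downloaded into data/ will do; this name is an example.
path = Path("data") / "2018-07-01-tweets.tsv"

with open(path, newline="", encoding="utf-8") as fin:
    reader = csv.reader(fin, delimiter="\t")
    for i, row in enumerate(reader):
        print(row)  # each row is a list of column values
        if i >= 4:  # peek at the first five rows only
            break
```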
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2019 CeShine Lee

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/fetch_data.py:
--------------------------------------------------------------------------------
import asyncio
import concurrent.futures
from datetime import date, timedelta
from pathlib import Path

from collect_htmls import get_url

OUTPUT_FOLDER = Path("data/")
OUTPUT_FOLDER.mkdir(exist_ok=True, parents=True)

URL_PATTERN = "https://archive.org/download/third-eye/{}-tweets.tsv"


def fetch_date(target_date):
    """Download the chyron TSV for a single date into OUTPUT_FOLDER."""
    date_formatted = target_date.strftime("%Y-%m-%d")
    response = get_url(URL_PATTERN.format(date_formatted), use_headers=False)
    with open(OUTPUT_FOLDER / (date_formatted + "-tweets.tsv"),
              "w", encoding="utf-8") as fout:
        fout.write(response.text)
    print(f"Downloaded {date_formatted}")


async def main(base_date, end_date):
    """Fetch every date in [base_date, end_date] (inclusive on both ends)."""
    duration = (end_date - base_date).days + 1
    # The downloads are blocking, so run them in a thread pool
    # (at most four in flight at a time).
    executor = concurrent.futures.ThreadPoolExecutor(max_workers=4)
    loop = asyncio.get_event_loop()
    tasks = [
        loop.run_in_executor(
            executor, fetch_date, base_date + timedelta(days=i))
        for i in range(duration)
    ]
    try:
        await asyncio.gather(*tasks)
    except Exception:
        # Cancel whatever has not started yet, then re-raise the original error.
        for task in tasks:
            task.cancel()
        raise

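
# Example (sketch): fetch_date() is a plain blocking function, so a single
# day can also be downloaded without the asyncio machinery, e.g.:
#
#     fetch_date(date(2018, 7, 1))
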
if __name__ == "__main__":
    try:
        LOOP = asyncio.get_event_loop()
        LOOP.run_until_complete(main(date(2018, 7, 1), date(2018, 8, 31)))
    finally:
        LOOP.close()
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
--------------------------------------------------------------------------------
/collect_htmls.py:
--------------------------------------------------------------------------------
import os
import logging
import html as ihtml
from typing import Dict
from urllib.parse import urlparse

from tenacity import retry, stop_after_attempt, wait_fixed
import requests
from bs4 import BeautifulSoup


# Optionally route all requests through a SOCKS proxy given as host:port in
# the SOCKS_PROXY environment variable.
PROXY_URL = os.environ.get("SOCKS_PROXY", None)
PROXIES: Dict = dict()
if PROXY_URL is not None:
    PROXIES = dict(http=f'socks5h://{PROXY_URL}',
                   https=f'socks5h://{PROXY_URL}')
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:63.0) Gecko/20100101 Firefox/63.0'}
LOGGER = logging.getLogger(__name__)


def meta_redirect(content, original_link):
    """Return the target URL of a <meta http-equiv="refresh"> tag, if any."""
    soup = BeautifulSoup(content, "html.parser")
    result = soup.find("meta", attrs={"http-equiv": "refresh"})
    if result:
        try:
            _, text = ihtml.unescape(result["content"]).split(";")
        except ValueError:
            LOGGER.warning(
                "Malformed redirect detected: %s", result['content'])
            return None
        text = text.strip()
        # Match the "url=" prefix case-insensitively, but keep the target's
        # original casing (URL paths can be case-sensitive).
        if text.lower().startswith("url="):
            url = text[4:].strip("\"\'")
            if not url.lower().startswith("http"):
                # Resolve a relative redirect against the original URL.
                base_url = '{uri.scheme}://{uri.netloc}'.format(
                    uri=urlparse(original_link))
                url = base_url + url
            return url
    return None


@retry(reraise=True, stop=stop_after_attempt(5), wait=wait_fixed(1))
def get_url(url, use_headers=True, timeout=10):
    """GET a URL (with up to 5 retries) and follow a single meta refresh."""
    LOGGER.debug("Retrieving %s", url)
    headers = HEADERS if use_headers else {}
    response = requests.get(url, proxies=PROXIES,
                            headers=headers, timeout=timeout)
    new_url = meta_redirect(response.text, url)
    if new_url:
        LOGGER.debug("Meta refresh detected. Retrieving %s...", new_url)
        return requests.get(new_url, proxies=PROXIES,
                            headers=headers, timeout=timeout)
    return response
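

# Example (sketch): a minimal demonstration of the meta-refresh handling
# above. The HTML snippet and URLs are made up for illustration.
if __name__ == "__main__":
    SAMPLE_HTML = '<meta http-equiv="refresh" content="0; url=/new-page">'
    # Resolves the relative target against the original page's scheme and
    # host; prints "https://example.com/new-page".
    print(meta_redirect(SAMPLE_HTML, "https://example.com/old-page"))
--------------------------------------------------------------------------------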