├── requirements.txt
├── README.md
├── LICENSE
├── fetch_data.py
├── .gitignore
└── collect_htmls.py

/requirements.txt:
--------------------------------------------------------------------------------
tenacity
beautifulsoup4
requests
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# chyrons_fetcher

A simple Python script that downloads the "Third Eye Data: TV News Archive chyrons" dataset from the Internet Archive.

## Example usage

Fetch chyrons from 2018/07/01 to 2018/08/31:

```python
import asyncio
from datetime import date

from fetch_data import main

if __name__ == "__main__":
    try:
        LOOP = asyncio.get_event_loop()
        LOOP.run_until_complete(main(date(2018, 7, 1), date(2018, 8, 31)))
    finally:
        LOOP.close()
```

Downloaded files will be in the `data/` subfolder.
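
## Reading the downloaded files

The downloaded files are plain tab-separated values, so the standard library's `csv` module is enough to inspect them. A minimal sketch (the column layout is not documented in this repo, so peek at a few rows first; the filename below is just an example of what the script produces):

```python
import csv
from pathlib import Path

# Any file previously downloaded into data/ will do; this name is an example.
path = Path("data") / "2018-07-01-tweets.tsv"

with open(path, newline="", encoding="utf-8") as fin:
    reader = csv.reader(fin, delimiter="\t")
    for i, row in enumerate(reader):
        print(row)  # each row is a list of column values
        if i >= 4:  # peek at the first five rows only
            break
```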
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2019 CeShine Lee

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/fetch_data.py:
--------------------------------------------------------------------------------
import asyncio
import concurrent.futures
from datetime import date, timedelta
from pathlib import Path

from collect_htmls import get_url

OUTPUT_FOLDER = Path("data/")
OUTPUT_FOLDER.mkdir(exist_ok=True, parents=True)

URL_PATTERN = "https://archive.org/download/third-eye/{}-tweets.tsv"


def fetch_date(target_date):
    """Download the chyron TSV for a single date into OUTPUT_FOLDER."""
    date_formatted = target_date.strftime("%Y-%m-%d")
    response = get_url(URL_PATTERN.format(date_formatted), use_headers=False)
    with open(OUTPUT_FOLDER / (date_formatted + "-tweets.tsv"),
              "w", encoding="utf-8") as fout:
        fout.write(response.text)
    print(f"Downloaded {date_formatted}")


async def main(base_date, end_date):
    """Fetch every date in [base_date, end_date] (inclusive on both ends)."""
    duration = (end_date - base_date).days + 1
    # The downloads are blocking, so run them in a thread pool
    # (at most four in flight at a time).
    executor = concurrent.futures.ThreadPoolExecutor(max_workers=4)
    loop = asyncio.get_event_loop()
    tasks = [
        loop.run_in_executor(
            executor, fetch_date, base_date + timedelta(days=i))
        for i in range(duration)
    ]
    try:
        await asyncio.gather(*tasks)
    except Exception:
        # Cancel whatever has not started yet, then re-raise the original error.
        for task in tasks:
            task.cancel()
        raise

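
# Example (sketch): fetch_date() is a plain blocking function, so a single
# day can also be downloaded without the asyncio machinery, e.g.:
#
#     fetch_date(date(2018, 7, 1))
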
if __name__ == "__main__":
    try:
        LOOP = asyncio.get_event_loop()
        LOOP.run_until_complete(main(date(2018, 7, 1), date(2018, 8, 31)))
    finally:
        LOOP.close()
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
--------------------------------------------------------------------------------
/collect_htmls.py:
--------------------------------------------------------------------------------
import os
import logging
import html as ihtml
from typing import Dict
from urllib.parse import urlparse

from tenacity import retry, stop_after_attempt, wait_fixed
import requests
from bs4 import BeautifulSoup


# Optionally route all requests through a SOCKS proxy given as host:port in
# the SOCKS_PROXY environment variable.
PROXY_URL = os.environ.get("SOCKS_PROXY", None)
PROXIES: Dict = dict()
if PROXY_URL is not None:
    PROXIES = dict(http=f'socks5h://{PROXY_URL}',
                   https=f'socks5h://{PROXY_URL}')
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:63.0) Gecko/20100101 Firefox/63.0'}
LOGGER = logging.getLogger(__name__)


def meta_redirect(content, original_link):
    """Return the target URL of a <meta http-equiv="refresh"> tag, if any."""
    soup = BeautifulSoup(content, "html.parser")
    result = soup.find("meta", attrs={"http-equiv": "refresh"})
    if result:
        try:
            _, text = ihtml.unescape(result["content"]).split(";")
        except ValueError:
            LOGGER.warning(
                "Malformed redirect detected: %s", result['content'])
            return None
        text = text.strip()
        # Match the "url=" prefix case-insensitively, but keep the target's
        # original casing (URL paths can be case-sensitive).
        if text.lower().startswith("url="):
            url = text[4:].strip("\"\'")
            if not url.lower().startswith("http"):
                # Resolve a relative redirect against the original URL.
                base_url = '{uri.scheme}://{uri.netloc}'.format(
                    uri=urlparse(original_link))
                url = base_url + url
            return url
    return None


@retry(reraise=True, stop=stop_after_attempt(5), wait=wait_fixed(1))
def get_url(url, use_headers=True, timeout=10):
    """GET a URL (with up to 5 retries) and follow a single meta refresh."""
    LOGGER.debug("Retrieving %s", url)
    headers = HEADERS if use_headers else {}
    response = requests.get(url, proxies=PROXIES,
                            headers=headers, timeout=timeout)
    new_url = meta_redirect(response.text, url)
    if new_url:
        LOGGER.debug("Meta refresh detected. Retrieving %s...", new_url)
        return requests.get(new_url, proxies=PROXIES,
                            headers=headers, timeout=timeout)
    return response
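

# Example (sketch): a minimal demonstration of the meta-refresh handling
# above. The HTML snippet and URLs are made up for illustration.
if __name__ == "__main__":
    SAMPLE_HTML = '<meta http-equiv="refresh" content="0; url=/new-page">'
    # Resolves the relative target against the original page's scheme and
    # host; prints "https://example.com/new-page".
    print(meta_redirect(SAMPLE_HTML, "https://example.com/old-page"))
--------------------------------------------------------------------------------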