├── .github
│   └── dependabot.yml
├── .gitignore
├── LICENSE
├── README.md
├── crawl.py
├── file2mongo.py
├── legacy
│   ├── __init__.py
│   ├── crawler
│   │   ├── crawler.py
│   │   ├── improved_crawler.py
│   │   └── utils.py
│   └── scripts
│       ├── __init__.py
│       ├── activecyber.py
│       ├── andrewhay.py
│       ├── australiancybersecuritymagazine.py
│       ├── betanews.py
│       ├── bromium.py
│       ├── cfobase.py
│       ├── chinainternetwatch.py
│       ├── comparitech.py
│       ├── crawled_list.csv
│       ├── cuphk.py
│       ├── cybersecuritynews.py
│       ├── cybersecurityreview.py
│       ├── cyberthreat.py
│       ├── helpnetsecurity.py
│       ├── hotforsecurity.py
│       ├── informationage.py
│       ├── infosecblog.py
│       ├── itsecurity.py
│       ├── lastwatchdog.py
│       ├── liquidmatrix.py
│       ├── lookingglasscyber.py
│       ├── malwarebytes.py
│       ├── martinoei.py
│       ├── riskiq.py
│       ├── robertpenz.py
│       ├── scmagazine.py
│       ├── securelist.py
│       ├── securingtomorrow.py
│       ├── securityaffairs.py
│       ├── securityboulevard.py
│       ├── securityledger.py
│       ├── securityweekly.py
│       ├── sensorstechforum.py
│       ├── socialengineer.py
│       ├── sucuri.py
│       ├── techcrunch.py
│       ├── techlear.py
│       ├── threatpost.py
│       ├── tripwire.py
│       ├── trustedsec.py
│       ├── unwire.py
│       ├── vipre.py
│       └── wired.py
├── legacy_crawl_all.py
├── legacy_main.py
├── requirements.txt
└── wpscraper
    ├── connector.py
    ├── crawler.py
    ├── document.py
    ├── headers.py
    ├── session.py
    └── utils.py

/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 | updates:
3 | - package-ecosystem: pip
4 |   directory: "/"
5 |   schedule:
6 |     interval: daily
7 |     time: "21:00"
8 |   open-pull-requests-limit: 10
9 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | data/
2 | venv/
3 | __pycache__/
4 | *.ipynb
5 | .idea/
6 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2020 Orix Au Yeung
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # wordpress-scraper
2 | 
3 | ## Description
4 | 
5 | A simple, easy-to-use scraper for pulling data from the WordPress JSON API.
6 | 
7 | ### Features
8 | - Supports storing crawled documents as MongoDB documents / JSON files
9 | - Automatically retries upon errors
10 | 
11 | ## Requirements
12 | 
13 | - Python 3.7+
14 | 
15 | ## Installation
16 | 
17 | ```bash
18 | pip install -r requirements.txt
19 | ```
20 | 
21 | ## How to use
22 | 
23 | ### Basic
24 | 
25 | Just run `crawl.py` with the site's URL supplied:
26 | 
27 | ```bash
28 | python3 crawl.py https://your.website.here
29 | ```
30 | 
31 | This will crawl the site using `DefaultCrawlSession`, which attempts to crawl all `posts`, `categories` & `tags` from the site.
32 | 
33 | The crawled JSON files will be stored in the directory `./data/`.
34 | 
35 | Most of the time, this will suffice for sites that:
36 | 1. do not require signing in
37 | 2. do not block the JSON API paths
38 | 
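39 | ### Uploading to MongoDB
40 | 
41 | Crawled JSON files can afterwards be pushed into MongoDB with `file2mongo.py`. The positional argument order follows the argparse definition in `file2mongo.py`; the host, port, database, collection, and credentials below are placeholders:
42 | 
43 | ```bash
44 | python3 file2mongo.py ./data/your.website.here localhost 27017 scraper posts admin secret
45 | ```
46 | 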
47 | ### Advanced
48 | For advanced usage and customizations, you may want to look at `wpscraper/session.py` for the actual crawling procedures and make your own `CrawlSession`, as sketched below.
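49 | 
50 | A minimal sketch of such a subclass is shown below. Note that the `resources` attribute is an assumption made for illustration only — `DefaultCrawlSession(url)` and `execute()` are taken from `crawl.py`, but the actual extension hooks live in `wpscraper/session.py`:
51 | 
52 | ```python
53 | from wpscraper.session import DefaultCrawlSession
54 | 
55 | 
56 | class PostsOnlySession(DefaultCrawlSession):
57 |     """Hypothetical session that narrows the crawl to posts only."""
58 |     # Assumed hook; the real attribute/method name may differ in session.py.
59 |     resources = ["posts"]
60 | 
61 | 
62 | session = PostsOnlySession("https://your.website.here")
63 | session.execute()
64 | ```
65 | 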
66 | ## Upcoming Features
67 | 
68 | - [x] Rewrite/Refactor
69 | - [x] MongoDB Connector
70 | - [ ] Async session
71 | - [ ] Authentication Module
72 | - [ ] Cloudflare circumvention
73 | - [ ] Configurable retry policies
74 | - [ ] Full WPv2 API resources support
75 | 
--------------------------------------------------------------------------------
/crawl.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | 
3 | from wpscraper.session import DefaultCrawlSession
4 | 
5 | 
6 | if __name__ == "__main__":
7 |     parser = argparse.ArgumentParser()
8 |     parser.add_argument("url")
9 |     args = parser.parse_args()
10 | 
11 |     session = DefaultCrawlSession(args.url)
12 |     session.execute()
13 | 
--------------------------------------------------------------------------------
/file2mongo.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import argparse
4 | 
5 | from wpscraper.connector import MongoDBConnector
6 | from wpscraper.document import RawDocument
7 | 
8 | parser = argparse.ArgumentParser()
9 | parser.add_argument("filepath", type=str)
10 | parser.add_argument("db_host", type=str)
11 | parser.add_argument("db_port", type=int)
12 | parser.add_argument("db_database", type=str)
13 | parser.add_argument("db_collection", type=str)
14 | parser.add_argument("username", type=str)
15 | parser.add_argument("password", type=str)
16 | 
17 | 
18 | def files_to_mongodb(filepath: str, db_host: str, db_port: int, db_database: str, db_collection: str,
19 |                      username: str, password: str, **kwargs):
20 |     files = [os.path.join(filepath, x) for x in os.listdir(filepath) if x.split(".")[-1].lower() == "json"]
21 |     c = MongoDBConnector(db_host=db_host, db_port=db_port, db_database=db_database, db_collection=db_collection,
22 |                          username=username, password=password, **kwargs)
23 |     for file in files:
24 |         with open(file, 'r') as f:
25 |             json_obj = json.load(f)
26 |             doc = RawDocument(raw_data=json_obj)
27 |             resource = doc.data['resource_type']
28 |             c.process_document(doc, resource)
29 |             print("{} uploaded.".format(file))
30 |     print("done.")
31 | 
32 | 
33 | if __name__ == "__main__":
34 |     args = parser.parse_args()
35 |     files_to_mongodb(filepath=args.filepath, db_host=args.db_host, db_port=args.db_port, db_database=args.db_database,
36 |                      db_collection=args.db_collection, username=args.username, password=args.password)
37 | 
--------------------------------------------------------------------------------
/legacy/__init__.py:
--------------------------------------------------------------------------------
1 | common_header = {
2 |     'Accept': 'application/json, text/javascript, */*; q=0.01',
3 |     'Accept-Encoding': '*',
4 |     'Accept-Language': 'zh-CN,zh;q=0.8',
5 |     'Cookie': '_ga=GA1.2.1769368666.1545970795; _hjIncludedInSample=1; _gid=GA1.2.240454939.1546227015; _fbp=fb.1.1546227015921.374506551; PHPSESSID=a856739d604f1496cd355f2dc35f3371',
6 |     'Connection': 'keep-alive',
7 |     'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/70.0.3538.67 Chrome/70.0.3538.67 Safari/537.36',
8 |     'X-Requested-With': 'XMLHttpRequest'
9 | }
10 | 
11 | 
12 | def common_crawl(wpc):
13 |     wpc.get_tags()
14 |     wpc.get_categories()
15 |     wpc.get_posts()
16 | 
--------------------------------------------------------------------------------
/legacy/crawler/crawler.py:
--------------------------------------------------------------------------------
1 | import json
2 | import time
3 | import os
4 | 
5 | import requests
6 | 
7 | from legacy.crawler import utils
8 | 
9 | 
10 | class WordPressCrawler:
11 |     def __init__(self, url, headers, output_dir, crawl_rate=25, verify_ssl=True, timeout=30, retry_standoff=30,
12 |                  max_retries=5):
13 |         self.api = url + '/wp-json/wp/v2'
14 |         self.headers = headers
15 |         self.crawl_rate = crawl_rate
16 |         self.verify_ssl = verify_ssl
17 |         self.output_dir = output_dir
18 |         self.timeout = timeout
19 |         self.retry_standoff = retry_standoff
20 |         self.max_retries = max_retries
21 |         # self.session = HTMLSession()
22 | 
23 |     def _get(self, path, output_file):
24 |         json_output = self._crawl_jsons(self.api + path)
25 |         if output_file:
26 |             utils.dump_json(json_output, output_file)
27 |         return json_output
28 | 
29 |     def get_categories(self, output_file=None):
30 |         if not output_file:
31 |             output_file = os.path.join(self.output_dir, "cats.json")
32 |         return self._get('/categories', output_file)
33 | 
34 |     def get_tags(self, output_file=None):
35 |         if not output_file:
36 |             output_file = os.path.join(self.output_dir, "tags.json")
37 |         return self._get('/tags', output_file)
38 | 
39 |     def get_posts(self, output_file=None):
40 |         if not output_file:
41 |             output_file = os.path.join(self.output_dir, "posts.json")
42 |         return self._get('/posts', output_file)
43 | 
44 |     def _isjsonarray(self, json):
45 |         return json and isinstance(json, list)
46 | 
47 |     def set_output_dir(self, output_dir):
48 |         self.output_dir = output_dir
49 | 
50 |     def _crawl_jsons(self, url):
51 |         output = []
52 |         i = 1
53 |         while True:
54 |             json_response = self._get_json_response('{}?per_page={}&page={}'.format(url, self.crawl_rate, i))
55 |             if self._isjsonarray(json_response):
56 |                 output += json_response
57 |                 i += 1
58 |             else:
59 |                 break
60 |         return output
61 | 
62 |     def _get_json_response(self, url):
63 |         retries = 1
64 |         while retries <= self.max_retries:
65 |             print("attempt #{} of {} - {}".format(retries, self.max_retries, url))
66 |             try:
67 |                 response = requests.get(url, headers=self.headers, timeout=self.timeout,
68 |                                         verify=self.verify_ssl)  # self.timeout defaults to 30 seconds
69 |                 print('response code: {}'.format(response.status_code))
70 |                 print('response head: {}'.format(response.text[:300]))
71 |                 # some APIs return a valid response despite code 400 (weird, I know)
72 |                 if response.status_code <= 400:
73 |                     # return json.loads(response.iter_lines())
74 |                     return json.loads(response.content.decode('utf-8').strip('\n').strip(' '))
75 |                 else:
76 |                     print("status code returned {}".format(response.status_code))
77 |             except requests.Timeout:
78 |                 print("Timed out.")
79 |             except Exception as e:
80 |                 print("Exception occurred:\nError Type: {}\nDetails: {}".format(type(e), str(e)))
81 |             retries += 1
82 |             print("waiting for {} seconds...".format(self.retry_standoff))
83 |             time.sleep(self.retry_standoff)
84 |         print("max no. of retries reached. exiting...")
85 |         return None
86 | 
--------------------------------------------------------------------------------
/legacy/crawler/improved_crawler.py:
--------------------------------------------------------------------------------
1 | import json
2 | import time
3 | import os
4 | from multiprocessing import Pool
5 | 
6 | import requests
7 | 
8 | from legacy.crawler import utils
9 | 
10 | 
11 | class MultiThreadedCrawler:
12 |     def __init__(self, url, headers, output_dir, crawl_rate=25, verify_ssl=True, timeout=30, retry_standoff=30,
13 |                  max_retries=5):
14 |         self.api = url + '/wp-json/wp/v2'
15 |         self.headers = headers
16 |         self.crawl_rate = crawl_rate
17 |         self.verify_ssl = verify_ssl
18 |         self.output_dir = output_dir
19 |         self.timeout = timeout
20 |         self.retry_standoff = retry_standoff
21 |         self.max_retries = max_retries
22 |         self.result_dump = []
23 | 
24 |     def _get(self, path, output_file):
25 |         json_output = self._crawl_jsons(self.api + path)
26 |         if output_file:
27 |             utils.dump_json(json_output, output_file)
28 |         return json_output
29 | 
30 |     def get_categories(self, output_file=None):
31 |         if not output_file:
32 |             output_file = os.path.join(self.output_dir, "cats.json")
33 |         return self._get('/categories', output_file)
34 | 
35 |     def get_tags(self, output_file=None):
36 |         if not output_file:
37 |             output_file = os.path.join(self.output_dir, "tags.json")
38 |         return self._get('/tags', output_file)
39 | 
40 |     def get_posts(self, output_file=None):
41 |         if not output_file:
42 |             output_file = os.path.join(self.output_dir, "posts.json")
43 |         return self._get('/posts', output_file)
44 | 
45 |     def _isjsonarray(self, json):
46 |         return json and isinstance(json, list)
47 | 
48 |     def set_output_dir(self, output_dir):
49 |         self.output_dir = output_dir
50 | 
51 |     def _crawl_jsons(self, url):
52 |         output = []
53 |         i = 1
54 |         while True:
55 |             urls = ['{}?per_page={}&page={}'.format(url, 1, x) for x in range(i, i+self.crawl_rate, 1)]
56 |             with Pool(self.crawl_rate) as p:
57 |                 json_responses = p.map(self._get_json_response, urls)
58 |             json_responses = list(filter(None, json_responses))
59 |             if self._isjsonarray(json_responses):
60 |                 output += json_responses
61 |                 i += self.crawl_rate
62 |             else:
63 |                 break
64 |         return output
65 | 
66 |     def _get_json_response(self, url):
67 |         retries = 1
68 |         while retries <= self.max_retries:
69 |             print("attempt #{} of {} - {}".format(retries, self.max_retries, url))
70 |             try:
71 |                 response = requests.get(url, headers=self.headers, timeout=self.timeout,
72 |                                         verify=self.verify_ssl)  # self.timeout defaults to 30 seconds
73 |                 print('response code: {}'.format(response.status_code))
74 |                 print('response head: {}'.format(response.text[:300]))
75 |                 # some APIs return a valid response despite code 400 (weird, I know)
76 |                 if response.status_code <= 400:
77 |                     return json.loads(response.content.decode('utf-8').strip('\n').strip(' '))[0]
78 |                 else:
79 |                     print("status code returned {}".format(response.status_code))
80 |             except requests.Timeout:
81 |                 print("Timed out.")
82 |             except Exception as e:
83 |                 print("Exception occurred:\nError Type: {}\nDetails: {}".format(type(e), str(e)))
84 |             retries += 1
85 |             print("waiting for {} seconds...".format(self.retry_standoff))
86 |             time.sleep(self.retry_standoff)
87 |         print("max no. of retries reached. exiting...")
88 |         return None
89 | 
--------------------------------------------------------------------------------
/legacy/crawler/utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | import json
4 | 
5 | 
6 | def ensure_file_directory(file):
7 |     inception(os.path.dirname(os.path.realpath(file)))
8 | 
9 | 
10 | def inception(directory):
11 |     parent_directory = os.path.dirname(directory)
12 |     if not os.path.exists(parent_directory):
13 |         inception(parent_directory)
14 |     if not os.path.exists(directory):
15 |         os.makedirs(directory)
16 | 
17 | 
18 | def dump_json(json_object, output_file):
19 |     ensure_file_directory(output_file)
20 |     with open(output_file, 'w') as f:
21 |         f.write(json.dumps(json_object))
22 | 
23 | 
24 | def remove_leading_scripts(response_text):
25 |     # strip anything before the first JSON array; fall back to the raw text when no match is found
26 |     match = re.search(r'(\[.*)', response_text)
27 |     return match.group(1) if match else response_text
28 | 
--------------------------------------------------------------------------------
/legacy/scripts/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoloSynth1/wordpress-scraper/61c1c832fe072b6e692e0c4c27c66d7dd2fc9920/legacy/scripts/__init__.py
--------------------------------------------------------------------------------
/legacy/scripts/activecyber.py:
--------------------------------------------------------------------------------
1 | from legacy.crawler import crawler
2 | import os
3 | 
4 | url = "https://www.activecyber.net"
5 | output_dir = os.path.join('.', 'data', 'activecyber.net')
6 | headers = {
7 |     'Accept': 'application/json, text/javascript, */*; q=0.01',
8 |     'Accept-Encoding': '*',
9 |     'Accept-Language': 'zh-CN,zh;q=0.8',
10 |     'Host': 'www.activecyber.net',
11 |     'Connection': 'keep-alive',
12 |     'Cookie': '_mkto_trk=id:497-ITQ-712&token:_mch-bromium.com-1545360431008-12984; _biz_sid=919162; _ga=GA1.2.1945164141.1545360432; _gid=GA1.2.949409536.1545360432; _biz_uid=a18ee78fd1664b1ecb82da38af7ca704; _biz_flagsA=%7B%22Version%22%3A1%2C%22Mkto%22%3A%221%22%2C%22XDomain%22%3A%221%22%7D; _biz_nA=4; _biz_pendingA=%5B%5D',
13 |     'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/71.0.3578.80 Chrome/71.0.3578.80 Safari/537.36',
14 |     'X-Requested-With': 'XMLHttpRequest'
15 | }
16 | 
17 | if __name__ == "__main__":
18 |     from legacy import common_crawl
19 |     wpc = crawler.WordPressCrawler(url, headers, output_dir)
20 |     common_crawl(wpc)
21 | 
--------------------------------------------------------------------------------
/legacy/scripts/andrewhay.py:
--------------------------------------------------------------------------------
1 | from legacy.crawler import crawler
2 | import os
3 | 
4 | url = "https://www.andrewhay.ca"
5 | output_dir = os.path.join('.', 'data', 'andrewhay.ca')
6 | headers = {
7 |     'Accept':'application/json, text/javascript, */*; q=0.01',
8 |     'Accept-Encoding':'*',
9 |     'Accept-Language':'zh-CN,zh;q=0.8',
10 |     'Cookie': '_ga=GA1.2.244034378.1540788304; _gid=GA1.2.222331079.1540788304',
11 |     'Connection':'keep-alive',
12 |     'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu
Chromium/70.0.3538.67 Chrome/70.0.3538.67 Safari/537.36', 13 | 'X-Requested-With':'XMLHttpRequest' 14 | } 15 | 16 | if __name__ == "__main__": 17 | from legacy import common_crawl 18 | wpc = crawler.WordPressCrawler(url, headers, output_dir) 19 | common_crawl(wpc) 20 | -------------------------------------------------------------------------------- /legacy/scripts/australiancybersecuritymagazine.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "https://australiancybersecuritymagazine.com.au" 5 | output_dir = os.path.join('.', 'data', 'australiancybersecuritymagazine.com.au') 6 | headers = { 7 | 'Accept':'application/json, text/javascript, */*; q=0.01', 8 | 'Accept-Encoding':'*', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'Host': 'australiancybersecuritymagazine.com.au', 11 | 'Connection':'keep-alive', 12 | 'Cookie': '_mkto_trk=id:497-ITQ-712&token:_mch-bromium.com-1545360431008-12984; _biz_sid=919162; _ga=GA1.2.1945164141.1545360432; _gid=GA1.2.949409536.1545360432; _biz_uid=a18ee78fd1664b1ecb82da38af7ca704; _biz_flagsA=%7B%22Version%22%3A1%2C%22Mkto%22%3A%221%22%2C%22XDomain%22%3A%221%22%7D; _biz_nA=4; _biz_pendingA=%5B%5D', 13 | 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/71.0.3578.80 Chrome/71.0.3578.80 Safari/537.36', 14 | 'X-Requested-With':'XMLHttpRequest' 15 | } 16 | 17 | if __name__ == "__main__": 18 | from legacy import common_crawl 19 | wpc = crawler.WordPressCrawler(url, headers, output_dir) 20 | common_crawl(wpc) 21 | -------------------------------------------------------------------------------- /legacy/scripts/betanews.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "https://betanews.com" 5 | output_dir = os.path.join('.', 'data', 'betanews.com') 6 | headers = { 7 | 'Accept':'application/json, text/javascript, */*; q=0.01', 8 | 'Accept-Encoding':'*', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'Cookie': '_ga=GA1.2.529771702.1541147410; __gads=ID=cddffd965b59ec5f:T=1541147412:S=ALNI_MYQm3Vphovvte59_MWZctfuIFxpmg; zdbb_swap_krux_id=1; _gid=GA1.2.1535543751.1541383876; geoCC=HK', 11 | 'Host': 'betanews.com', 12 | 'Connection':'keep-alive', 13 | 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/70.0.3538.67 Chrome/70.0.3538.67 Safari/537.36', 14 | 'X-Requested-With':'XMLHttpRequest' 15 | } 16 | 17 | if __name__ == "__main__": 18 | from legacy import common_crawl 19 | wpc = crawler.WordPressCrawler(url, headers, output_dir, crawl_rate=100) 20 | common_crawl(wpc) 21 | -------------------------------------------------------------------------------- /legacy/scripts/bromium.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "http://www.bromium.com" 5 | output_dir = os.path.join('.', 'data', 'bromium.com') 6 | headers = { 7 | 'Accept':'application/json, text/javascript, */*; q=0.01', 8 | 'Accept-Encoding':'*', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'Host': 'www.bromium.com', 11 | 'Connection':'keep-alive', 12 | 'Cookie': '_mkto_trk=id:497-ITQ-712&token:_mch-bromium.com-1545360431008-12984; _biz_sid=919162; _ga=GA1.2.1945164141.1545360432; _gid=GA1.2.949409536.1545360432; _biz_uid=a18ee78fd1664b1ecb82da38af7ca704; 
_biz_flagsA=%7B%22Version%22%3A1%2C%22Mkto%22%3A%221%22%2C%22XDomain%22%3A%221%22%7D; _biz_nA=4; _biz_pendingA=%5B%5D', 13 | 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/71.0.3578.80 Chrome/71.0.3578.80 Safari/537.36', 14 | 'X-Requested-With':'XMLHttpRequest' 15 | } 16 | 17 | if __name__ == "__main__": 18 | from legacy import common_crawl 19 | wpc = crawler.WordPressCrawler(url, headers, output_dir) 20 | common_crawl(wpc) 21 | -------------------------------------------------------------------------------- /legacy/scripts/cfobase.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "https://cfobase.com/" 5 | output_dir = os.path.join('.', 'data', 'cfobase.com') 6 | headers = { 7 | 'Accept':'application/json, text/javascript, */*; q=0.01', 8 | 'Accept-Encoding':'*', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'Host': 'cfobase.com', 11 | 'Connection':'keep-alive', 12 | 'Cookie': '_mkto_trk=id:497-ITQ-712&token:_mch-bromium.com-1545360431008-12984; _biz_sid=919162; _ga=GA1.2.1945164141.1545360432; _gid=GA1.2.949409536.1545360432; _biz_uid=a18ee78fd1664b1ecb82da38af7ca704; _biz_flagsA=%7B%22Version%22%3A1%2C%22Mkto%22%3A%221%22%2C%22XDomain%22%3A%221%22%7D; _biz_nA=4; _biz_pendingA=%5B%5D', 13 | 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/71.0.3578.80 Chrome/71.0.3578.80 Safari/537.36', 14 | 'X-Requested-With':'XMLHttpRequest' 15 | } 16 | 17 | if __name__ == "__main__": 18 | from legacy import common_crawl 19 | wpc = crawler.WordPressCrawler(url, headers, output_dir) 20 | common_crawl(wpc) 21 | -------------------------------------------------------------------------------- /legacy/scripts/chinainternetwatch.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "https://www.chinainternetwatch.com" 5 | output_dir = os.path.join('.', 'data', 'chinainternetwatch.com') 6 | headers = { 7 | 'Accept':'application/json, text/javascript, */*; q=0.01', 8 | 'Accept-Encoding':'*', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'Cookie': '_ga=GA1.2.1769368666.1545970795; _hjIncludedInSample=1; _gid=GA1.2.240454939.1546227015; _fbp=fb.1.1546227015921.374506551; PHPSESSID=a856739d604f1496cd355f2dc35f3371', 11 | 'Connection':'keep-alive', 12 | 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/70.0.3538.67 Chrome/70.0.3538.67 Safari/537.36', 13 | 'X-Requested-With':'XMLHttpRequest' 14 | } 15 | 16 | if __name__ == "__main__": 17 | from legacy import common_crawl 18 | wpc = crawler.WordPressCrawler(url, headers, output_dir, crawl_rate=100) 19 | common_crawl(wpc) 20 | -------------------------------------------------------------------------------- /legacy/scripts/comparitech.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "https://www.comparitech.com" 5 | output_dir = os.path.join('.', 'data', 'www.comparitech.com') 6 | headers = { 7 | 'Accept':'application/json, text/javascript, */*; q=0.01', 8 | 'Accept-Encoding':'*', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'Cookie': '_conv_r=s:heimdalsecurity.com*m:referral*t:*c:; _ga=GA1.2.235624317.1541147443; PHPSESSID=4865b95b6bd5a85c6e071c1e2f47497f; _gid=GA1.2.595733218.1541383875; 
_conv_v=vi:1*sc:4*cs:1541383837*fs:1541147440*pv:5*exp:{}*ps:1541221464; _conv_s=si:4*sh:1541383837127-0.002642356216356223*pv:2; _ceg.s=php9gm; _ceg.u=php9gm', 11 | 'Connection':'keep-alive', 12 | 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/70.0.3538.67 Chrome/70.0.3538.67 Safari/537.36', 13 | 'X-Requested-With':'XMLHttpRequest' 14 | } 15 | 16 | if __name__ == "__main__": 17 | from legacy import common_crawl 18 | wpc = crawler.WordPressCrawler(url, headers, output_dir) 19 | common_crawl(wpc) 20 | -------------------------------------------------------------------------------- /legacy/scripts/crawled_list.csv: -------------------------------------------------------------------------------- 1 | url,output_dir 2 | "https://australiancybersecuritymagazine.com.au","./data/australiancybersecuritymagazine.com.au" 3 | "https://cyberriskleaders.com","./data/cyberriskleaders.com" 4 | "https://techcrunch.com","./data/techcrunch.com" 5 | "https://www.wired.com","./data/wired.com" 6 | "https://www.varonis.com/blog","./data/www.varonis.com" 7 | "https://www.asiapacificsecuritymagazine.com","./data/www.asiapacificsecuritymagazine.com" 8 | "https://www.nozominetworks.com","./data/www.nozominetworks.com" 9 | "https://www.delapcpa.com","./data/delapcyber.com" 10 | "https://hub.packtpub.com","./data/hub.packtpub.com" 11 | "https://irishtechnews.ie","./data/irishtechnews.ie" 12 | "https://amuedge.com","./data/incyberdefense.com" 13 | "https://www.incidentresponse.com","./data/www.incidentresponse.com" 14 | "https://www.informationsecuritybuzz.com/","./data/www.informationsecuritybuzz.com" 15 | "https://www.cbronline.com","./data/www.cbronline.com" 16 | "https://www.cyberscoop.com","./data/www.cyberscoop.com" 17 | "https://www.fedscoop.com","./data/www.fedscoop.com" 18 | "https://technode.com","./data/technode.com" 19 | "https://www.pymnts.com","./data/www.pymnts.com" 20 | "https://www.tripwire.com/state-of-security","./data/tripwire.com" 21 | "https://www.scmagazine.com","./data/www.scmagazine.com" 22 | "https://arstechnica.com","./data/arstechnica.com" 23 | "https://www.rd.com","./data/rd.com" 24 | "https://blog.mozilla.org","./data/blog.mozilla.org" 25 | "https://pulse.target.com","./data/pulse.target.com" 26 | "https://hackaday.com","./data/hackaday.com" 27 | "http://blog.ecocn.org","./data/blog.ecocn.org" 28 | "https://time.com","./data/time.com" 29 | "https://www.itworldcanada.com/","./data/www.itworldcanada.com" 30 | "https://www.sfexaminer.com","./data/sfexaminer.com" 31 | "https://cybersecuritynews.com","./data/cybersecuritynews.com" 32 | "https://cybersecurityventures.com","./data/cybersecurityventures.com" 33 | "https://www.innovationnewsnetwork.com","./data/innovationnewsnetwork.com" 34 | "https://www.telos.com","./data/telos.com" 35 | 36 | -------------------------------------------------------------------------------- /legacy/scripts/cuphk.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "https://www.cup.com.hk" 5 | output_dir = os.path.join('.', 'data', 'cup.com.hk') 6 | headers = { 7 | 'Accept':'application/json, text/javascript, */*; q=0.01', 8 | 'Accept-Encoding':'*', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'Cookie': '__cfduid=d0c4282dcdb115dbcd08edfba7e0e8e721560849079', 11 | 'Connection':'keep-alive', 12 | 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/70.0.3538.67 
Chrome/70.0.3538.67 Safari/537.36', 13 | 'X-Requested-With':'XMLHttpRequest' 14 | } 15 | 16 | if __name__ == "__main__": 17 | from legacy import common_crawl 18 | wpc = crawler.WordPressCrawler(url, headers, output_dir, crawl_rate=100) 19 | common_crawl(wpc) 20 | -------------------------------------------------------------------------------- /legacy/scripts/cybersecuritynews.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "https://cybersecuritynews.co.uk/" 5 | output_dir = os.path.join('.', 'data', 'cybersecuritynews.co.uk') 6 | headers = { 7 | 'Accept':'application/json, text/javascript, */*; q=0.01', 8 | 'Accept-Encoding':'*', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'Cookie': '__cfduid=d168be0b4533032d20c2aa278ecb62fe31541147413; check=true; _ga=GA1.2.130067401.1541147420; _gid=GA1.2.1176639717.1541147420; bd112=Xc5BCsIwEAXQu2ThShICEbES3Lj1BEYkbadNoE3CZGoV8e622enqD58H869vNuHAKuaIUmWEEfM889pTCx2EFpA3cTQiewIjLtYHI3oIgHY4%2B97TEpbsKWkjNnZMR9KrvLs4QrI9lK4dNIRyOe0iqQwNx%2FrJ48P9PApAReVVdREXN6Gn1%2F%2BaggBRsy0jP0KmpWCV3Ckp1V7Jw%2Bf2BQ%3D%3D; AMCVS_0E920C0F53DA9E9B0A490D45%40AdobeOrg=1; AMCV_0E920C0F53DA9E9B0A490D45%40AdobeOrg=2035320058%7CMCIDTS%7C17838%7CMCMID%7C26407325438182498473478927876586191970%7CMCAAMLH-1541752221%7C11%7CMCAAMB-1541752221%7CRKhpRz8krg2tLO6pguXWp5olkAcUniQYPHaMWWgdJ3xzPWQmdj0y%7CMCOPTOUT-1541154621s%7CNONE%7CMCAID%7CNONE%7CvVersion%7C3.3.0; mbox=session#8b43bd7f0847437182a44000418828d2#1541149284|PC#8b43bd7f0847437182a44000418828d2.22_35#1604392224; s_cc=true; aam_uuid=26426712967223226083480932539320192677; s_ht=1541147427592; s_hc=1%7C0%7C0%7C0%7C0; s_ppvl=h4s%253Ahome%2C29%2C29%2C1009%2C1920%2C1009%2C1920%2C1080%2C1%2CP; s_ppn=h4s%3Ahome; s_ppv=h4s%253Ahome%2C84%2C29%2C2946%2C1920%2C1009%2C1920%2C1080%2C1%2CP', 11 | 'Connection':'keep-alive', 12 | 'Host': 'cybersecuritynews.co.uk', 13 | 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/70.0.3538.67 Chrome/70.0.3538.67 Safari/537.36', 14 | 'X-Requested-With':'XMLHttpRequest' 15 | } 16 | 17 | if __name__ == "__main__": 18 | from legacy import common_crawl 19 | wpc = crawler.WordPressCrawler(url, headers, output_dir) 20 | common_crawl(wpc) 21 | -------------------------------------------------------------------------------- /legacy/scripts/cybersecurityreview.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "https://www.cybersecurity-review.com" 5 | output_dir = os.path.join('.', 'data', 'cybersecurity-review.com') 6 | headers = { 7 | 'Accept':'application/json, text/javascript, */*; q=0.01', 8 | 'Accept-Encoding':'*', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'Cookie': '_ga=GA1.2.910557633.1540266945; _gid=GA1.2.1879377337.1541147383', 11 | 'Host': 'www.cybersecurity-review.com', 12 | 'Connection':'keep-alive', 13 | 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/70.0.3538.67 Chrome/70.0.3538.67 Safari/537.36', 14 | 'X-Requested-With':'XMLHttpRequest' 15 | } 16 | 17 | if __name__ == "__main__": 18 | from legacy import common_crawl 19 | wpc = crawler.WordPressCrawler(url, headers, output_dir) 20 | common_crawl(wpc) 21 | -------------------------------------------------------------------------------- /legacy/scripts/cyberthreat.py: 
-------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "http://cyberthreat.blog" 5 | output_dir = os.path.join('.', 'data', 'cyberthreat.blog') 6 | headers = { 7 | 'Accept':'application/json, text/javascript, */*; q=0.01', 8 | 'Accept-Encoding':'*', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'Connection':'keep-alive', 11 | 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36', 12 | 'X-Requested-With':'XMLHttpRequest' 13 | } 14 | 15 | if __name__ == "__main__": 16 | from legacy import common_crawl 17 | wpc = crawler.WordPressCrawler(url, headers, output_dir) 18 | common_crawl(wpc) 19 | -------------------------------------------------------------------------------- /legacy/scripts/helpnetsecurity.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "https://www.helpnetsecurity.com" 5 | output_dir = os.path.join('.', 'data', 'helpnetsecurity.com') 6 | headers = { 7 | 'Accept':'application/json, text/javascript, */*; q=0.01', 8 | 'Accept-Encoding':'*', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'Cookie': '_ga=GA1.2.910557633.1540266945; _gid=GA1.2.1879377337.1541147383', 11 | 'Host': 'www.helpnetsecurity.com', 12 | 'Connection':'keep-alive', 13 | 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/70.0.3538.67 Chrome/70.0.3538.67 Safari/537.36', 14 | 'X-Requested-With':'XMLHttpRequest' 15 | } 16 | 17 | if __name__ == "__main__": 18 | from legacy import common_crawl 19 | wpc = crawler.WordPressCrawler(url, headers, output_dir) 20 | common_crawl(wpc) 21 | -------------------------------------------------------------------------------- /legacy/scripts/hotforsecurity.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "https://hotforsecurity.bitdefender.com" 5 | output_dir = os.path.join('.', 'data', 'hotforsecurity.bitdefender.com') 6 | headers = { 7 | 'Accept':'application/json, text/javascript, */*; q=0.01', 8 | 'Accept-Encoding':'*', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'Cookie': '__cfduid=d168be0b4533032d20c2aa278ecb62fe31541147413; check=true; _ga=GA1.2.130067401.1541147420; _gid=GA1.2.1176639717.1541147420; bd112=Xc5BCsIwEAXQu2ThShICEbES3Lj1BEYkbadNoE3CZGoV8e622enqD58H869vNuHAKuaIUmWEEfM889pTCx2EFpA3cTQiewIjLtYHI3oIgHY4%2B97TEpbsKWkjNnZMR9KrvLs4QrI9lK4dNIRyOe0iqQwNx%2FrJ48P9PApAReVVdREXN6Gn1%2F%2BaggBRsy0jP0KmpWCV3Ckp1V7Jw%2Bf2BQ%3D%3D; AMCVS_0E920C0F53DA9E9B0A490D45%40AdobeOrg=1; AMCV_0E920C0F53DA9E9B0A490D45%40AdobeOrg=2035320058%7CMCIDTS%7C17838%7CMCMID%7C26407325438182498473478927876586191970%7CMCAAMLH-1541752221%7C11%7CMCAAMB-1541752221%7CRKhpRz8krg2tLO6pguXWp5olkAcUniQYPHaMWWgdJ3xzPWQmdj0y%7CMCOPTOUT-1541154621s%7CNONE%7CMCAID%7CNONE%7CvVersion%7C3.3.0; mbox=session#8b43bd7f0847437182a44000418828d2#1541149284|PC#8b43bd7f0847437182a44000418828d2.22_35#1604392224; s_cc=true; aam_uuid=26426712967223226083480932539320192677; s_ht=1541147427592; s_hc=1%7C0%7C0%7C0%7C0; s_ppvl=h4s%253Ahome%2C29%2C29%2C1009%2C1920%2C1009%2C1920%2C1080%2C1%2CP; s_ppn=h4s%3Ahome; s_ppv=h4s%253Ahome%2C84%2C29%2C2946%2C1920%2C1009%2C1920%2C1080%2C1%2CP', 11 | 'Connection':'keep-alive', 12 | 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu 
Chromium/70.0.3538.67 Chrome/70.0.3538.67 Safari/537.36', 13 | 'X-Requested-With':'XMLHttpRequest' 14 | } 15 | 16 | if __name__ == "__main__": 17 | from legacy import common_crawl 18 | wpc = crawler.WordPressCrawler(url, headers, output_dir) 19 | common_crawl(wpc) 20 | -------------------------------------------------------------------------------- /legacy/scripts/informationage.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "https://www.information-age.com" 5 | output_dir = os.path.join('.', 'data', 'information-age.com') 6 | headers = { 7 | 'Accept':'application/json, text/javascript, */*; q=0.01', 8 | 'Accept-Encoding':'*', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'Host': 'www.information-age.com', 11 | 'Connection':'keep-alive', 12 | 'Cookie': '_mkto_trk=id:497-ITQ-712&token:_mch-bromium.com-1545360431008-12984; _biz_sid=919162; _ga=GA1.2.1945164141.1545360432; _gid=GA1.2.949409536.1545360432; _biz_uid=a18ee78fd1664b1ecb82da38af7ca704; _biz_flagsA=%7B%22Version%22%3A1%2C%22Mkto%22%3A%221%22%2C%22XDomain%22%3A%221%22%7D; _biz_nA=4; _biz_pendingA=%5B%5D', 13 | 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/71.0.3578.80 Chrome/71.0.3578.80 Safari/537.36', 14 | 'X-Requested-With':'XMLHttpRequest' 15 | } 16 | 17 | if __name__ == "__main__": 18 | from legacy import common_crawl 19 | wpc = crawler.WordPressCrawler(url, headers, output_dir) 20 | common_crawl(wpc) 21 | -------------------------------------------------------------------------------- /legacy/scripts/infosecblog.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "https://www.infosecblog.org" 5 | output_dir = os.path.join('.', 'data', 'infosecblog.org') 6 | headers = { 7 | 'Accept':'application/json, text/javascript, */*; q=0.01', 8 | 'Accept-Encoding':'*', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'Cookie': '__cfduid=dfc1c22a9bb42efd57b3c2231d0c43d2a1540799668; _ga=GA1.2.958394734.1540799670; _gid=GA1.2.1380839925.1540799670', 11 | 'Connection':'keep-alive', 12 | 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/70.0.3538.67 Chrome/70.0.3538.67 Safari/537.36', 13 | 'X-Requested-With':'XMLHttpRequest' 14 | } 15 | 16 | if __name__ == "__main__": 17 | from legacy import common_crawl 18 | wpc = crawler.WordPressCrawler(url, headers, output_dir) 19 | common_crawl(wpc) 20 | -------------------------------------------------------------------------------- /legacy/scripts/itsecurity.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "http://itsecurity.co.uk" 5 | output_dir = os.path.join('.', 'data', 'itsecurity.co.uk') 6 | headers = { 7 | 'Accept':'application/json, text/javascript, */*; q=0.01', 8 | 'Accept-Encoding':'*', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'Host': 'itsecurity.co.uk', 11 | 'Connection':'keep-alive', 12 | 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/70.0.3538.67 Chrome/70.0.3538.67 Safari/537.36', 13 | 'X-Requested-With':'XMLHttpRequest' 14 | } 15 | 16 | if __name__ == "__main__": 17 | from legacy import common_crawl 18 | wpc = crawler.WordPressCrawler(url, headers, output_dir) 19 | common_crawl(wpc) 20 | 
-------------------------------------------------------------------------------- /legacy/scripts/lastwatchdog.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "https://www.lastwatchdog.com" 5 | output_dir = os.path.join('.', 'data', 'lastwatchdog') 6 | headers = { 7 | 'Accept':'application/json, text/javascript, */*; q=0.01', 8 | 'Accept-Encoding':'*', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'cookie':'__cfduid=dfea0ee9fe9b8670fdd8222535adb9ec21540527308; __unam=c02b1ec-166acddd608-734d3c03-1', 11 | 'Connection':'keep-alive', 12 | 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36', 13 | 'X-Requested-With':'XMLHttpRequest' 14 | } 15 | 16 | if __name__ == "__main__": 17 | from legacy import common_crawl 18 | wpc = crawler.WordPressCrawler(url, headers, output_dir) 19 | common_crawl(wpc) 20 | -------------------------------------------------------------------------------- /legacy/scripts/liquidmatrix.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "https://www.liquidmatrix.org/blog" 5 | output_dir = os.path.join('.', 'data', 'liquidmatrix.org') 6 | headers = { 7 | 'Accept':'application/json, text/javascript, */*; q=0.01', 8 | 'Accept-Encoding':'*', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'Cookie': '__atuvc=1%7C44; __atuvs=5bd6b92398768410000', 11 | 'Host': 'www.liquidmatrix.org', 12 | 'Connection':'keep-alive', 13 | 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/70.0.3538.67 Chrome/70.0.3538.67 Safari/537.36', 14 | 'X-Requested-With':'XMLHttpRequest' 15 | } 16 | 17 | if __name__ == "__main__": 18 | from legacy import common_crawl 19 | wpc = crawler.WordPressCrawler(url, headers, output_dir) 20 | common_crawl(wpc) 21 | -------------------------------------------------------------------------------- /legacy/scripts/lookingglasscyber.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "https://www.lookingglasscyber.com" 5 | output_dir = os.path.join('.', 'data', 'lookingglasscyber') 6 | headers = { 7 | 'Accept':'application/json, text/javascript, */*; q=0.01', 8 | 'Accept-Encoding':'*', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'Connection':'keep-alive', 11 | 'Cookie':'__cfduid=d8bd3b5ad23455ce8b26bcbb869ff80c31540451497', 12 | 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36', 13 | 'X-Requested-With':'XMLHttpRequest' 14 | } 15 | 16 | if __name__ == "__main__": 17 | from legacy import common_crawl 18 | wpc = crawler.WordPressCrawler(url, headers, output_dir) 19 | common_crawl(wpc) 20 | -------------------------------------------------------------------------------- /legacy/scripts/malwarebytes.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "https://blog.malwarebytes.com" 5 | output_dir = os.path.join('.', 'data', 'blog.malwarebytes.com') 6 | headers = { 7 | 'Accept':'application/json, text/javascript, */*; q=0.01', 8 | 'Accept-Encoding':'*', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'Cookie': '_ga=GA1.2.1230427134.1541147423; _gid=GA1.2.1527966877.1541383870; 
_fbp=fb.1.1541468793370.261995265', 11 | 'Connection':'keep-alive', 12 | 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/70.0.3538.67 Chrome/70.0.3538.67 Safari/537.36', 13 | 'X-Requested-With':'XMLHttpRequest' 14 | } 15 | 16 | if __name__ == "__main__": 17 | from legacy import common_crawl 18 | wpc = crawler.WordPressCrawler(url, headers, output_dir, crawl_rate=100) 19 | common_crawl(wpc) 20 | -------------------------------------------------------------------------------- /legacy/scripts/martinoei.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "https://martinoei.com" 5 | output_dir = os.path.join('.', 'data', 'martinoei.com') 6 | headers = { 7 | 'Accept':'application/json, text/javascript, */*; q=0.01', 8 | 'Accept-Encoding':'*', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'Cookie': '_ga=GA1.2.1769368666.1545970795; _hjIncludedInSample=1; _gid=GA1.2.240454939.1546227015; _fbp=fb.1.1546227015921.374506551; PHPSESSID=a856739d604f1496cd355f2dc35f3371', 11 | 'Connection':'keep-alive', 12 | 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/70.0.3538.67 Chrome/70.0.3538.67 Safari/537.36', 13 | 'X-Requested-With':'XMLHttpRequest' 14 | } 15 | 16 | if __name__ == "__main__": 17 | from legacy import common_crawl 18 | wpc = crawler.WordPressCrawler(url, headers, output_dir, crawl_rate=100) 19 | common_crawl(wpc) 20 | -------------------------------------------------------------------------------- /legacy/scripts/riskiq.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "https://www.riskiq.com" 5 | output_dir = os.path.join('.', 'data', 'riskiq.com') 6 | 7 | headers = { 8 | 'Accept':'application/json, text/javascript, */*; q=0.01', 9 | 'Accept-Encoding':'gzip, deflate, sdch, br', 10 | 'Accept-Language':'zh-CN,zh;q=0.8', 11 | 'Connection':'keep-alive', 12 | 'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36', 13 | 'X-Requested-With':'XMLHttpRequest' 14 | } 15 | 16 | if __name__ == "__main__": 17 | from legacy import common_crawl 18 | wpc = crawler.WordPressCrawler(url, headers, output_dir) 19 | common_crawl(wpc) 20 | -------------------------------------------------------------------------------- /legacy/scripts/robertpenz.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "https://robert.penz.name" 5 | output_dir = os.path.join('.', 'data', 'robert.penz.name') 6 | headers = { 7 | 'Accept':'application/json, text/javascript, */*; q=0.01', 8 | 'Accept-Encoding':'*', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'Cookie': 'pay_ent_smp=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCIsInZlciI6MX0.eyJ1cmxzIjpbIi8yMDAxLzAzL2hvbmV5cG90cy1iYWl0LWZvci10aGUtY3JhY2tlciJdLCJjbnQiOjEsIm1heCI6NCwiZXhwIjoyMDE4MTB9.SvGXeHRGxLna6rX9Kr9qTXrSU-pSbIkrDE_hQwQ60O4; CN_xid=d75239c8-659c-476c-9f5d-ea0547275e55; _sdsat_landing_page=https://www.wired.com/2001/03/honeypots-bait-for-the-cracker/|1540044766455; _sdsat_session_count=1; _sdsat_traffic_source=https://www.google.com.hk/; visitedCount_jwt=1; AMCVS_F7093025512D2B690A490D44%40AdobeOrg=1; CN_sp=fa58760d-4e72-4ce3-b443-2c9b42243770; CN_su=d19800c0-59da-4191-a88c-5c4148ebb692; CN_segments=; 
_ga=GA1.2.802886981.1540044768; fpcid=2456436582056134046_FP; v30=google.com.hk; v39=google.com.hk; s_cc=true; __gads=ID=f8c89cd159acd50f:T=1540044768:S=ALNI_MainR0wk-mflQoNeN_UO7dory-7gQ; aamconde=conde%3Dsv%3BCN%3D764985; aam_optimizely=aam%3D226821; aam_uuid=26426712967223226083480932539320192677; _sdsat_lt_pages_viewed=2; _sdsat_pages_viewed=2; _sdsat_AAM_UUID=26426712967223226083480932539320192677; CN_visits_m=1541001600572%26vn%3D2; CN_in_visit_m=true; sID=2955f7c3-91dd-4bc7-abc7-999968ecee3c; pID=1d3e3648-9926-466b-beec-6a2e98c8702c; AMCV_F7093025512D2B690A490D44%40AdobeOrg=1099438348%7CMCIDTS%7C17834%7CMCMID%7C26574736731012813853459705698242364028%7CMCAAMLH-1541404553%7C3%7CMCAAMB-1541404553%7CRKhpRz8krg2tLO6pguXWp5olkAcUniQYPHaMWWgdJ3xzPWQmdj0y%7CMCOPTOUT-1540806953s%7CNONE%7CMCAID%7CNONE%7CvVersion%7C2.1.0; s_vnum_m=1541001600667%26vn%3D2; sinvisit_m=true; s_depth=1; timeSpent=1540799753187; s_ppn=https%3A%2F%2Fwww.wired.com%2Fcategory%2Fsecurity%2Fthreatlevel%2F; s_pct=Index; s_nr=1540799753188-Repeat; sailthru_pageviews=1; bounceClientVisit2825v=N4IgNgDiBcIBYBcEQM4FIDMBBNAmAYnvgO6kB0xAlgE4CmAJmQMYD2AtkUwIYK0DmLagE8iCOHR5haAN1pgiIADQhqMECAC+QA; _polar_tu=*_%22mgtn%22_@2Q_u_@_97f78f97-5c77-4716-b78f-a0dccc974ab0_Q_n_@3Q_s_@2Q_sc_@*_v_@1Q_a_@1+Q_ss_@_%22phcop7_Q_sl_@_%22phcop7_Q_sd_@*+Q_v_@nullQ_vc_@*_e_@0+Q_vs_@_%22phcop7_Q_vl_@_%22phcop7_Q_vd_@*+Q_vu_@_555fdf068442e929ddada46236b2ea5b_Q_vf_@_%22jnu0e179_+; _parsely_session={%22sid%22:2%2C%22surl%22:%22https://www.wired.com/category/threatlevel/%22%2C%22sref%22:%22%22%2C%22sts%22:1540799755083%2C%22slts%22:1540044768270}; _parsely_visitor={%22id%22:%22c609b887-dad6-414c-906e-f6a107dbb880%22%2C%22session_count%22:2%2C%22last_session_ts%22:1540799755083}; sailthru_content=e43720c11f5345e88d86bc1d5be31f74e2553d06f8d9ea3b9cb7420abe100f46; sailthru_visitor=1cb98baf-6809-4646-a0f8-aa82685e000a; AMP_TOKEN=%24NOT_FOUND; _gid=GA1.2.368334264.1540799760', 11 | 'Connection':'keep-alive', 12 | 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/70.0.3538.67 Chrome/70.0.3538.67 Safari/537.36', 13 | 'X-Requested-With':'XMLHttpRequest' 14 | } 15 | 16 | if __name__ == "__main__": 17 | from legacy import common_crawl 18 | wpc = crawler.WordPressCrawler(url, headers, output_dir) 19 | common_crawl(wpc) 20 | -------------------------------------------------------------------------------- /legacy/scripts/scmagazine.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "https://www.scmagazine.com" 5 | output_dir = os.path.join('.', 'data', 'www.scmagazine.com') 6 | 7 | headers = { 8 | 'Accept':'application/json, text/javascript, */*; q=0.01', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'Connection':'keep-alive', 11 | 'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36', 12 | 'X-Requested-With':'XMLHttpRequest' 13 | } 14 | 15 | if __name__ == "__main__": 16 | from legacy import common_crawl 17 | wpc = crawler.WordPressCrawler(url, headers, output_dir) 18 | common_crawl(wpc) 19 | -------------------------------------------------------------------------------- /legacy/scripts/securelist.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "https://securelist.com" 5 | output_dir = os.path.join('.', 'data', 'securelist.com') 6 | 
headers = { 7 | 'Accept':'application/json, text/javascript, */*; q=0.01', 8 | 'Accept-Encoding':'*', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'Cookie': '_ga=GA1.2.1769368666.1545970795; _hjIncludedInSample=1; _gid=GA1.2.240454939.1546227015; _fbp=fb.1.1546227015921.374506551; PHPSESSID=a856739d604f1496cd355f2dc35f3371', 11 | 'Connection':'keep-alive', 12 | 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/70.0.3538.67 Chrome/70.0.3538.67 Safari/537.36', 13 | 'X-Requested-With':'XMLHttpRequest' 14 | } 15 | 16 | if __name__ == "__main__": 17 | from legacy import common_crawl 18 | wpc = crawler.WordPressCrawler(url, headers, output_dir, crawl_rate=5) 19 | common_crawl(wpc) 20 | -------------------------------------------------------------------------------- /legacy/scripts/securingtomorrow.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "https://www.mcafee.com/blogs" 5 | output_dir = os.path.join('.', 'data', 'securingtomorrow.mcafee.com') 6 | 7 | headers = { 8 | 'Accept':'application/json, text/javascript, */*; q=0.01', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'Connection':'keep-alive', 11 | 'cookie':'utag_main=_st:1540559125612$ses_id:1540558064662%3Bexp-session; PHPSESSID=99p4mht3963nnm4649bhdhcna1', 12 | 'Host': 'www.mcafee.com', 13 | 'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36', 14 | 'X-Requested-With':'XMLHttpRequest' 15 | } 16 | 17 | if __name__ == "__main__": 18 | from legacy import common_crawl 19 | wpc = crawler.WordPressCrawler(url, headers, output_dir) 20 | common_crawl(wpc) 21 | -------------------------------------------------------------------------------- /legacy/scripts/securityaffairs.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "https://securityaffairs.co/wordpress" 5 | output_dir = os.path.join('.', 'data', 'securityaffairs.co-wordpress') 6 | 7 | headers = { 8 | 'Accept':'application/json, text/javascript, */*; q=0.01', 9 | 'Accept-Encoding':'gzip, deflate, sdch, br', 10 | 'Accept-Language':'zh-CN,zh;q=0.8', 11 | 'Connection':'keep-alive', 12 | 'cookie':'__sharethis_cookie_test__=1; __unam=6f69f6a-166b1935d01-3a484fa3-3', 13 | 'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36', 14 | 'X-Requested-With':'XMLHttpRequest' 15 | } 16 | 17 | if __name__ == "__main__": 18 | from legacy import common_crawl 19 | wpc = crawler.WordPressCrawler(url, headers, output_dir) 20 | common_crawl(wpc) 21 | -------------------------------------------------------------------------------- /legacy/scripts/securityboulevard.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import improved_crawler 2 | import os 3 | 4 | url = "https://securityboulevard.com" 5 | output_dir = os.path.join('.', 'data', 'securityboulevard.com') 6 | headers = { 7 | 'Accept':'application/json, text/javascript, */*; q=0.01', 8 | 'Accept-Encoding':'*', 9 | 'Accept-Language':'en-US,en;q=0.9,zh-CN,zh;q=0.8', 10 | 'Cache-control': 'max-age=0', 11 | 'Cookie': 'timer=3; lastvisit=1618499885', 12 | 'Dnt': '1', 13 | 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36', 
14 | } 15 | 16 | if __name__ == "__main__": 17 | from legacy import common_crawl 18 | wpc = improved_crawler.MultiThreadedCrawler(url, headers, output_dir, crawl_rate=20, retry_standoff=30, max_retries=5) 19 | common_crawl(wpc) 20 | -------------------------------------------------------------------------------- /legacy/scripts/securityledger.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "https://securityledger.com" 5 | output_dir = os.path.join('.', 'data', 'securityledger.com') 6 | headers = { 7 | 'Accept':'application/json, text/javascript, */*; q=0.01', 8 | 'Accept-Encoding':'*', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'Cookie': '__cfduid=df69a10a216bf3a67efa57115a197bbda1540799737; __sharethis_cookie_test__=1; __unam=7639673-166bed22de8-519e7a93-1; __qca=P0-1129919385-1540799738659; _ga=GA1.2.963329934.1540799739; _gid=GA1.2.1250730364.1540799739', 11 | 'Connection':'keep-alive', 12 | 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/70.0.3538.67 Chrome/70.0.3538.67 Safari/537.36', 13 | 'X-Requested-With':'XMLHttpRequest' 14 | } 15 | 16 | if __name__ == "__main__": 17 | from legacy import common_crawl 18 | wpc = crawler.WordPressCrawler(url, headers, output_dir) 19 | common_crawl(wpc) 20 | -------------------------------------------------------------------------------- /legacy/scripts/securityweekly.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "https://securityweekly.com" 5 | output_dir = os.path.join('.', 'data', 'securityweekly.com') 6 | headers = { 7 | 'Accept':'application/json, text/javascript, */*; q=0.01', 8 | 'Accept-Encoding':'*', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'Cookie': '_ga=GA1.2.1656740477.1540263278; _gid=GA1.2.1979594887.1540799746', 11 | 'Connection':'keep-alive', 12 | 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/70.0.3538.67 Chrome/70.0.3538.67 Safari/537.36', 13 | 'X-Requested-With':'XMLHttpRequest' 14 | } 15 | 16 | if __name__ == "__main__": 17 | from legacy import common_crawl 18 | wpc = crawler.WordPressCrawler(url, headers, output_dir) 19 | common_crawl(wpc) 20 | -------------------------------------------------------------------------------- /legacy/scripts/sensorstechforum.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "https://sensorstechforum.com" 5 | output_dir = os.path.join('.', 'data', 'sensorstechforum.com') 6 | headers = { 7 | 'Accept':'application/json, text/javascript, */*; q=0.01', 8 | 'Accept-Encoding':'*', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'Cookie': '__cfduid=d6e8d60c5ed6e3694722eda50a248c54a1541577018; PHPSESSID=824147b5f2c890a2dc93bfbaccf9e157; cookiescriptaccept=visit', 11 | 'Connection':'keep-alive', 12 | 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/70.0.3538.67 Chrome/70.0.3538.67 Safari/537.36', 13 | 'X-Requested-With':'XMLHttpRequest' 14 | } 15 | 16 | if __name__ == "__main__": 17 | from legacy import common_crawl 18 | wpc = crawler.WordPressCrawler(url, headers, output_dir, crawl_rate=100) 19 | common_crawl(wpc) 20 | -------------------------------------------------------------------------------- /legacy/scripts/socialengineer.py: 
-------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "https://www.social-engineer.org" 5 | output_dir = os.path.join('.', 'data', 'social-engineer.org') 6 | headers = { 7 | 'Accept':'application/json, text/javascript, */*; q=0.01', 8 | 'Accept-Encoding':'*', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'Cookie': 'PHPSESSID=66j0js30oq2j43o59fa278pti1; wordpress_test_cookie=WP+Cookie+check; __unam=fea26eb-166bed13d3b-2f1cc5de-1; _ga=GA1.2.1491302686.1540799678; _gid=GA1.2.157321464.1540799678', 11 | 'Host': 'www.social-engineer.org', 12 | 'Connection':'keep-alive', 13 | 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/70.0.3538.67 Chrome/70.0.3538.67 Safari/537.36', 14 | 'X-Requested-With':'XMLHttpRequest' 15 | } 16 | 17 | if __name__ == "__main__": 18 | from legacy import common_crawl 19 | wpc = crawler.WordPressCrawler(url, headers, output_dir) 20 | common_crawl(wpc) 21 | -------------------------------------------------------------------------------- /legacy/scripts/sucuri.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "https://blog.sucuri.net" 5 | output_dir = os.path.join('.', 'data', 'blog.sucuri.net') 6 | headers = { 7 | 'Accept':'application/json, text/javascript, */*; q=0.01', 8 | 'Accept-Encoding':'*', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'Cookie': '_ga=GA1.2.1452495584.1541147436; IR_gbd=sucuri.net; _gid=GA1.2.959211258.1541383890; IR_3713=1541559256293%7C0%7C1541559256293; IR_PI=3df49081-c233-9d54-c728-932176615ccd%7C1541645656293', 11 | 'Connection':'keep-alive', 12 | 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/70.0.3538.67 Chrome/70.0.3538.67 Safari/537.36', 13 | 'X-Requested-With':'XMLHttpRequest' 14 | } 15 | 16 | if __name__ == "__main__": 17 | from legacy import common_crawl 18 | wpc = crawler.WordPressCrawler(url, headers, output_dir, crawl_rate=100) 19 | common_crawl(wpc) 20 | -------------------------------------------------------------------------------- /legacy/scripts/techcrunch.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "https://techcrunch.com" 5 | output_dir = os.path.join('.', 'data', 'techcrunch.com') 6 | headers = { 7 | 'Accept':'application/json, text/javascript, */*; q=0.01', 8 | 'Accept-Encoding':'*', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'Cookie': 'GUCS=AZbgFVVR; BX=dh9q691dto2nb&b=3&s=pa; GUC=AQEBAQFb3VFcqEIeDwRg&s=AQAAAHrL6bS1&g=W9wK9Q; rxx=1igbwm2xq2h.1b7gr4g0&v=1; __pcvc={}; _parsely_session={%22sid%22:1%2C%22surl%22:%22https://techcrunch.com/tag/security/%22%2C%22sref%22:%22https://heimdalsecurity.com/blog/best-internet-security-blogs/%22%2C%22sts%22:1541147372572%2C%22slts%22:0}; _ga=GA1.2.1478942211.1541147373; _gid=GA1.2.641246751.1541147373; _fbp=fb.1.1541147372697.1423203867; _parsely_visitor={%22id%22:%22pid=ec7dcbdb884243adc40612940d2620db%22%2C%22session_count%22:1%2C%22last_session_ts%22:1541147372572}; __tbc=%7Bjzx%7DjGAToaZMxJYLoS7N4KRjDSIBQx5XEUgUJ2gvwnLO_mQfpRr8AZeO09jGk7-hTUyQLRad9Q5Lm3MJbSMChjRhiKNoGVtYYENGn5Vzh_MxqnGS8ho0EJnccNxVf1vgcsspOzaeIILFhlSTw5lulLLvTw; __pat=-14400000; 
__pvi=%7B%22id%22%3A%22v-2018-11-02-16-29-33-940-6TjxfdUCVZ3f1YuQ-2f3e0384dd8f3212b5d9c2020699090e%22%2C%22domain%22%3A%22.techcrunch.com%22%2C%22time%22%3A1541147374314%7D; xbc=%7Bjzx%7D_e5BhmMRjtsjUjGbrtX0c7h7R3xM4VwABoRuGqnXs1Ch_5rnoQguNFJyyQn8ud-iL8IqeSU80X8vYQUgLUhfHjuT7rlXqV_haFtcBrb8yuIRISMjwJMKlsrJHHm5uxsQSNVe4coFP2tX0siiAgGZ0F02N2xHWSErhk02CjXpVfgMydpMnNyxptMGZ-xXMB5A; __adblocker=false', 11 | 'Connection':'keep-alive', 12 | 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/70.0.3538.67 Chrome/70.0.3538.67 Safari/537.36', 13 | 'X-Requested-With':'XMLHttpRequest' 14 | } 15 | 16 | if __name__ == "__main__": 17 | from legacy import common_crawl 18 | wpc = crawler.WordPressCrawler(url, headers, output_dir) 19 | common_crawl(wpc) 20 | -------------------------------------------------------------------------------- /legacy/scripts/techlear.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "https://www.techlear.com/blog" 5 | output_dir = os.path.join('.', 'data', 'techlear.com') 6 | headers = { 7 | 'Accept':'application/json, text/javascript, */*; q=0.01', 8 | 'Accept-Encoding':'*', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'Host': 'www.techlear.com', 11 | 'Connection':'keep-alive', 12 | 'Cookie': '_mkto_trk=id:497-ITQ-712&token:_mch-bromium.com-1545360431008-12984; _biz_sid=919162; _ga=GA1.2.1945164141.1545360432; _gid=GA1.2.949409536.1545360432; _biz_uid=a18ee78fd1664b1ecb82da38af7ca704; _biz_flagsA=%7B%22Version%22%3A1%2C%22Mkto%22%3A%221%22%2C%22XDomain%22%3A%221%22%7D; _biz_nA=4; _biz_pendingA=%5B%5D', 13 | 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/71.0.3578.80 Chrome/71.0.3578.80 Safari/537.36', 14 | 'X-Requested-With':'XMLHttpRequest' 15 | } 16 | 17 | if __name__ == "__main__": 18 | from legacy import common_crawl 19 | wpc = crawler.WordPressCrawler(url, headers, output_dir) 20 | common_crawl(wpc) 21 | -------------------------------------------------------------------------------- /legacy/scripts/threatpost.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "https://threatpost.com/" 5 | output_dir = os.path.join('.', 'data', 'threatpost') 6 | headers = { 7 | 'Accept':'application/json, text/javascript, */*; q=0.01', 8 | 'Accept-Encoding':'*', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'Cookie': '_ga=GA1.2.908572152.1540264737; __gads=ID=b803df93913bd567:T=1540264741:S=ALNI_MaUpeM35RC42f9YZMJE4x8eKpPo-Q; _gid=GA1.2.1096157912.1540537307; _gat_UA-35676203-21=1; _fbp=fb.1.1540537307445.696691226; _gat_gtag_UA_109681207_2=1', 11 | 'Host': 'threatpost.com', 12 | 'Connection':'keep-alive', 13 | 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36', 14 | 'X-Requested-With':'XMLHttpRequest' 15 | } 16 | 17 | if __name__ == "__main__": 18 | from legacy import common_crawl 19 | wpc = crawler.WordPressCrawler(url, headers, output_dir) 20 | common_crawl(wpc) 21 | -------------------------------------------------------------------------------- /legacy/scripts/tripwire.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "https://www.tripwire.com/state-of-security" 5 | output_dir = os.path.join('.', 'data', 
'tripwire.com') 6 | headers = { 7 | 'Accept':'application/json, text/javascript, */*; q=0.01', 8 | 'Accept-Encoding':'*', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'Cookie': 'ASP.NET_SessionID=rno4pepppbkdh2f1bn5nc2kj; SC_ANALYTICS_GLOBAL_COOKIE=f9c357b4f7124f7890bd980c59b97db4|False; active=yes; Set_Me=3415655320.1.1933537720.2330721216; SnapABugRef=https%3A%2F%2Fwww.tripwire.com%2F%20; SnapABugHistory=1#; SnapABugVisit=1#1541675065', 11 | 'Connection':'keep-alive', 12 | 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/70.0.3538.67 Chrome/70.0.3538.67 Safari/537.36', 13 | 'X-Requested-With':'XMLHttpRequest' 14 | } 15 | 16 | if __name__ == "__main__": 17 | from legacy import common_crawl 18 | wpc = crawler.WordPressCrawler(url, headers, output_dir, crawl_rate=25) 19 | common_crawl(wpc) 20 | -------------------------------------------------------------------------------- /legacy/scripts/trustedsec.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "https://www.trustedsec.com" 5 | output_dir = os.path.join('.', 'data', 'trustedsec.com') 6 | headers = { 7 | 'Accept':'application/json, text/javascript, */*; q=0.01', 8 | 'Accept-Encoding':'*', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'Host': 'www.trustedsec.com', 11 | 'Connection':'keep-alive', 12 | 'Cookie': '_mkto_trk=id:497-ITQ-712&token:_mch-bromium.com-1545360431008-12984; _biz_sid=919162; _ga=GA1.2.1945164141.1545360432; _gid=GA1.2.949409536.1545360432; _biz_uid=a18ee78fd1664b1ecb82da38af7ca704; _biz_flagsA=%7B%22Version%22%3A1%2C%22Mkto%22%3A%221%22%2C%22XDomain%22%3A%221%22%7D; _biz_nA=4; _biz_pendingA=%5B%5D', 13 | 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/71.0.3578.80 Chrome/71.0.3578.80 Safari/537.36', 14 | 'X-Requested-With':'XMLHttpRequest' 15 | } 16 | 17 | if __name__ == "__main__": 18 | from legacy import common_crawl 19 | wpc = crawler.WordPressCrawler(url, headers, output_dir) 20 | common_crawl(wpc) 21 | -------------------------------------------------------------------------------- /legacy/scripts/unwire.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "https://unwire.pro" 5 | output_dir = os.path.join('.', 'data', 'unwire.pro') 6 | headers = { 7 | 'Accept':'application/json, text/javascript, */*; q=0.01', 8 | 'Accept-Encoding':'*', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'Connection':'keep-alive', 11 | 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/70.0.3538.67 Chrome/70.0.3538.67 Safari/537.36', 12 | } 13 | 14 | if __name__ == "__main__": 15 | from legacy import common_crawl 16 | wpc = crawler.WordPressCrawler(url, headers, output_dir, crawl_rate=5) 17 | common_crawl(wpc) 18 | -------------------------------------------------------------------------------- /legacy/scripts/vipre.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "https://www.vipre.com" 5 | output_dir = os.path.join('.', 'data', 'www.vipre.com') 6 | headers = { 7 | 'Accept':'application/json, text/javascript, */*; q=0.01', 8 | 'Accept-Encoding':'*', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'Cookie': '_ga=GA1.2.1644535100.1541147443; __adroll_fpc=3a119bf44db7ee75d21bdc4bd8006d25; 
__idcontext=eyJjb29raWVJRCI6IjZFM0I3VUNSNDVOWVZHT1o3WTJGSDZOT1dXRUtFMkM0TlhKNUdBVDdIVkdRPT09PSIsImRldmljZUlEIjoiNkUzQjdVQ1I0QjNKRlg2WFZZSVdMNjcyU1M0SllNU0dKTEc1T0FDR0c0RUE9PT09IiwiaXYiOiI1SVdKNlpMUlBKN0lNNlcyVkpWMjZQRzJKST09PT09PSIsInYiOjF9; _gid=GA1.2.1952222110.1541383857; _gcl_au=1.1.535369970.1541532633; __ar_v4=R5EX2LAD7FAOVD6PWNPH6O%3A20181102%3A10%7CN7AGIAEPRZDM5FMAGV2QUY%3A20181102%3A10%7C53FLNYE57ZE4ZDAHOYANNY%3A20181102%3A10', 11 | 'Host': 'www.vipre.com', 12 | 'Connection':'keep-alive', 13 | 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/70.0.3538.67 Chrome/70.0.3538.67 Safari/537.36', 14 | 'X-Requested-With':'XMLHttpRequest' 15 | } 16 | 17 | 18 | if __name__ == "__main__": 19 | from legacy import common_crawl 20 | wpc = crawler.WordPressCrawler(url, headers, output_dir, crawl_rate=100, verify_ssl=False) 21 | common_crawl(wpc) 22 | -------------------------------------------------------------------------------- /legacy/scripts/wired.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "https://www.wired.com" 5 | output_dir = os.path.join('.', 'data', 'wired.com') 6 | headers = { 7 | 'Accept':'application/json, text/javascript, */*; q=0.01', 8 | 'Accept-Encoding':'*', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'Cookie': 'pay_ent_smp=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCIsInZlciI6MX0.eyJ1cmxzIjpbIi8yMDAxLzAzL2hvbmV5cG90cy1iYWl0LWZvci10aGUtY3JhY2tlciJdLCJjbnQiOjEsIm1heCI6NCwiZXhwIjoyMDE4MTB9.SvGXeHRGxLna6rX9Kr9qTXrSU-pSbIkrDE_hQwQ60O4; CN_xid=d75239c8-659c-476c-9f5d-ea0547275e55; _sdsat_landing_page=https://www.wired.com/2001/03/honeypots-bait-for-the-cracker/|1540044766455; _sdsat_session_count=1; _sdsat_traffic_source=https://www.google.com.hk/; visitedCount_jwt=1; AMCVS_F7093025512D2B690A490D44%40AdobeOrg=1; CN_sp=fa58760d-4e72-4ce3-b443-2c9b42243770; CN_su=d19800c0-59da-4191-a88c-5c4148ebb692; CN_segments=; _ga=GA1.2.802886981.1540044768; fpcid=2456436582056134046_FP; v30=google.com.hk; v39=google.com.hk; s_cc=true; __gads=ID=f8c89cd159acd50f:T=1540044768:S=ALNI_MainR0wk-mflQoNeN_UO7dory-7gQ; aamconde=conde%3Dsv%3BCN%3D764985; aam_optimizely=aam%3D226821; aam_uuid=26426712967223226083480932539320192677; _sdsat_lt_pages_viewed=2; _sdsat_pages_viewed=2; _sdsat_AAM_UUID=26426712967223226083480932539320192677; CN_visits_m=1541001600572%26vn%3D2; CN_in_visit_m=true; sID=2955f7c3-91dd-4bc7-abc7-999968ecee3c; pID=1d3e3648-9926-466b-beec-6a2e98c8702c; AMCV_F7093025512D2B690A490D44%40AdobeOrg=1099438348%7CMCIDTS%7C17834%7CMCMID%7C26574736731012813853459705698242364028%7CMCAAMLH-1541404553%7C3%7CMCAAMB-1541404553%7CRKhpRz8krg2tLO6pguXWp5olkAcUniQYPHaMWWgdJ3xzPWQmdj0y%7CMCOPTOUT-1540806953s%7CNONE%7CMCAID%7CNONE%7CvVersion%7C2.1.0; s_vnum_m=1541001600667%26vn%3D2; sinvisit_m=true; s_depth=1; timeSpent=1540799753187; s_ppn=https%3A%2F%2Fwww.wired.com%2Fcategory%2Fsecurity%2Fthreatlevel%2F; s_pct=Index; s_nr=1540799753188-Repeat; sailthru_pageviews=1; bounceClientVisit2825v=N4IgNgDiBcIBYBcEQM4FIDMBBNAmAYnvgO6kB0xAlgE4CmAJmQMYD2AtkUwIYK0DmLagE8iCOHR5haAN1pgiIADQhqMECAC+QA; _polar_tu=*_%22mgtn%22_@2Q_u_@_97f78f97-5c77-4716-b78f-a0dccc974ab0_Q_n_@3Q_s_@2Q_sc_@*_v_@1Q_a_@1+Q_ss_@_%22phcop7_Q_sl_@_%22phcop7_Q_sd_@*+Q_v_@nullQ_vc_@*_e_@0+Q_vs_@_%22phcop7_Q_vl_@_%22phcop7_Q_vd_@*+Q_vu_@_555fdf068442e929ddada46236b2ea5b_Q_vf_@_%22jnu0e179_+; 
_parsely_session={%22sid%22:2%2C%22surl%22:%22https://www.wired.com/category/threatlevel/%22%2C%22sref%22:%22%22%2C%22sts%22:1540799755083%2C%22slts%22:1540044768270}; _parsely_visitor={%22id%22:%22c609b887-dad6-414c-906e-f6a107dbb880%22%2C%22session_count%22:2%2C%22last_session_ts%22:1540799755083}; sailthru_content=e43720c11f5345e88d86bc1d5be31f74e2553d06f8d9ea3b9cb7420abe100f46; sailthru_visitor=1cb98baf-6809-4646-a0f8-aa82685e000a; AMP_TOKEN=%24NOT_FOUND; _gid=GA1.2.368334264.1540799760', 11 | 'Connection':'keep-alive', 12 | 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/70.0.3538.67 Chrome/70.0.3538.67 Safari/537.36', 13 | 'X-Requested-With':'XMLHttpRequest' 14 | } 15 | 16 | 17 | if __name__ == "__main__": 18 | from legacy import common_crawl 19 | wpc = crawler.WordPressCrawler(url, headers, output_dir) 20 | common_crawl(wpc) 21 | --------------------------------------------------------------------------------
/legacy_crawl_all.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import os 3 | from multiprocessing import Pool 4 | import argparse 5 | 6 | from legacy_main import crawl 7 | 8 | threads = 10 9 | crawl_rate = 5 10 | 11 | 12 | def multiple_crawl(jobs): 13 | # each CSV row must provide 'url' and 'output_dir' columns 14 | flattened_jobs = map(lambda x: (x['url'], x['output_dir'], crawl_rate), jobs) 15 | with Pool(threads) as p: 16 | p.starmap(crawl, flattened_jobs) 17 | 18 | 19 | def parse(csv_path: str): 20 | if os.path.exists(csv_path): 21 | with open(csv_path, 'r') as f: 22 | csvreader = csv.DictReader(f, delimiter=',', quotechar='"') 23 | return list(csvreader) 24 | else: 25 | raise FileNotFoundError(csv_path) 26 | 27 | 28 | def main(csv_file): 29 | jobs = parse(csv_file) 30 | multiple_crawl(jobs) 31 | 32 | 33 | if __name__ == "__main__": 34 | parser = argparse.ArgumentParser() 35 | parser.add_argument('--csvfile', default="./legacy/scripts/crawled_list.csv") 36 | args = parser.parse_args() 37 | main(args.csvfile) 38 | --------------------------------------------------------------------------------
/legacy_main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from legacy.crawler import crawler 3 | from legacy import common_header, common_crawl 4 | 5 | 6 | def crawl(url, output_dir, crawl_rate=25): 7 | wpc = crawler.WordPressCrawler(url, common_header, output_dir, crawl_rate) 8 | common_crawl(wpc) 9 | 10 | 11 | if __name__ == "__main__": 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument("url") 14 | parser.add_argument("output_dir") 15 | parser.add_argument("--crawl_rate", type=int, default=25) 16 | args = parser.parse_args() 17 | crawl(args.url, args.output_dir, args.crawl_rate) 18 | --------------------------------------------------------------------------------
/requirements.txt: -------------------------------------------------------------------------------- 1 | pymongo==4.7.1 2 | requests==2.31.0 --------------------------------------------------------------------------------
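The two legacy entry points above work together: `legacy_crawl_all.py` fans per-site jobs out over a process pool, and each job runs through `legacy_main.py`'s `crawl()`. The column layout of `crawled_list.csv` is not shown here, but judging from the tuple built in `multiple_crawl`, every row needs at least a `url` and an `output_dir` field. A minimal, hypothetical file might look like this:

```csv
url,output_dir
https://example-blog.com,./data/example-blog.com
https://another-blog.org,./data/another-blog.org
```

Any single row can also be crawled directly:

```bash
python3 legacy_main.py https://example-blog.com ./data/example-blog.com --crawl_rate 25
```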
/wpscraper/connector.py: -------------------------------------------------------------------------------- 1 | import os 2 | from abc import ABC, abstractmethod 3 | import hashlib 4 | import json 5 | 6 | from pymongo import MongoClient 7 | 8 | from wpscraper.document import Document 9 | 10 | 11 | def create_directory(directory): 12 | # os.makedirs already creates any missing parent directories, so no manual recursion is needed 13 | os.makedirs(directory, exist_ok=True) 14 | 15 | 16 | class Connector(ABC): 17 | @abstractmethod 18 | def process_document(self, document: Document, *args, **kwargs): 19 | pass 20 | 21 | 22 | class FileSystemConnector(Connector): 23 | def __init__(self, folder: str, save_as_individual_files: bool = False): 24 | self.folder = folder 25 | self.save_as_individual_files = save_as_individual_files 26 | create_directory(self.folder) 27 | 28 | def process_document(self, document: Document, resource: str): 29 | json_string = json.dumps(document.data) 30 | if self.save_as_individual_files: 31 | self._generate_individual_document(json_string) 32 | else: 33 | self._append_resource_document(resource, json_string) 34 | 35 | def _generate_individual_document(self, json_string: str): 36 | # name the file after the SHA-256 of its content, so identical documents collapse into one file 37 | filename = hashlib.sha256(json_string.encode('utf-8')).hexdigest() 38 | file_to_write = os.path.join(self.folder, "{}.json".format(filename)) 39 | with open(file_to_write, 'w') as f: 40 | f.write(json_string) 41 | 42 | def _append_resource_document(self, resource: str, json_string: str): 43 | # append to a single JSON-lines file per resource type, e.g. posts.json 44 | file_to_write = os.path.join(self.folder, "{}.json".format(resource)) 45 | with open(file_to_write, 'a') as f: 46 | f.write(json_string) 47 | f.write("\n") # add linebreak at the end 48 | 49 | 50 | class MongoDBConnector(Connector): 51 | def __init__(self, db_host: str, db_port: int, db_database: str, db_collection: str, 52 | username: str, password: str, auth_source: str = "admin", auth_mechanism: str = "SCRAM-SHA-256"): 53 | self.db_host = db_host 54 | self.db_port = db_port 55 | self.db_database = db_database 56 | self.db_collection = db_collection 57 | self.username = username 58 | self.password = password 59 | self.auth_source = auth_source 60 | self.auth_mechanism = auth_mechanism 61 | self.client = MongoClient(host=self.db_host, port=self.db_port, username=self.username, password=self.password, 62 | authSource=self.auth_source, authMechanism=self.auth_mechanism) 63 | 64 | def process_document(self, document: Document, resource: str): 65 | # resource is unused here; the parameter is kept so every connector shares the same call signature 66 | doc_id = self.client[self.db_database][self.db_collection].insert_one(document=document.data).inserted_id 67 | if not doc_id: 68 | raise ConnectionError("Couldn't insert document into MongoDB.") 69 | --------------------------------------------------------------------------------
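A note on the two `FileSystemConnector` modes: per-document files are content-addressed by SHA-256, so re-crawled duplicates collapse into a single file, while the default append mode produces one JSON-lines file per resource. A minimal sketch of driving a connector outside a session (paths and payload are illustrative):

```python
from wpscraper.connector import FileSystemConnector
from wpscraper.document import RawDocument

doc = RawDocument(raw_data={"id": 1, "title": "hello"})

# individual mode: one content-addressed file per document
FileSystemConnector("./data/demo", save_as_individual_files=True).process_document(doc, "posts")

# append mode (default): everything lands in ./data/demo/posts.json, one object per line
FileSystemConnector("./data/demo").process_document(doc, "posts")
```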
/wpscraper/crawler.py: -------------------------------------------------------------------------------- 1 | import json 2 | import time 3 | from abc import ABC, abstractmethod 4 | 5 | import requests 6 | 7 | from wpscraper.headers import Headers 8 | 9 | 10 | class Crawler(ABC): 11 | def __init__(self, url, headers: Headers, verify_ssl: bool, timeout: int, max_retries: int): 12 | # strip any trailing slash so the API path never contains a double slash 13 | self.api_path = url.rstrip('/') + '/wp-json/wp/v2/' 14 | self.headers = headers 15 | self.verify_ssl = verify_ssl 16 | self.timeout = timeout 17 | self.max_retries = max_retries 18 | 19 | @abstractmethod 20 | def crawl(self, resource: str, *args, **kwargs): 21 | pass 22 | 23 | 24 | class SimpleRequestsCrawler(Crawler): 25 | def __init__(self, url, headers: Headers = None, crawl_rate: int = 25, verify_ssl: bool = True, timeout: int = 30, 26 | max_retries: int = 5, constant_retry_standoff: int = 30): 27 | super().__init__(url=url, headers=headers, verify_ssl=verify_ssl, timeout=timeout, max_retries=max_retries) 28 | self.crawled_resource_count = {} # next page number to request, per resource 29 | self.crawl_rate = crawl_rate 30 | self.constant_retry_standoff = constant_retry_standoff 31 | 32 | def crawl(self, resource: str): 33 | documents = [] 34 | if resource not in self.crawled_resource_count: 35 | self.crawled_resource_count[resource] = 1 36 | objs = self._get_json_response( 37 | self.api_path + resource + '?per_page={}&page={}'.format(self.crawl_rate, self.crawled_resource_count[resource]) 38 | ) 39 | # a successful page is a JSON list; an error object (e.g. page number out of range) is a dict, which ends the crawl 40 | if objs and isinstance(objs, list): 41 | documents = objs 42 | self.crawled_resource_count[resource] += 1 43 | else: 44 | print("No documents are crawled.") 45 | return documents 46 | 47 | def _get_json_response(self, url): 48 | attempt_count = 1 49 | while attempt_count <= self.max_retries: # "<=", so max_retries really means that many attempts 50 | try: 51 | response = requests.get(url, headers=self.headers.headers, timeout=self.timeout, verify=self.verify_ssl) 52 | print('subpath: {}'.format(url)) 53 | print('response code: {}'.format(response.status_code)) 54 | print('response head: {}'.format(response.text[:300])) 55 | # WordPress answers an out-of-range page number with HTTP 400 and a JSON error body, so 400 responses are parsed as well 56 | if response.status_code in (200, 400): 57 | # parse only the first line of the body; the API is assumed to return single-line JSON 58 | return json.loads(next(response.iter_lines())) 59 | else: 60 | print("status code returned {}".format(response.status_code)) 61 | except requests.Timeout: 62 | print("Timed out.") 63 | except Exception as e: 64 | print("Exception occurred: {}".format(e)) 65 | attempt_count += 1 66 | if attempt_count > self.max_retries: 67 | break # don't sleep after the final attempt 68 | print("waiting for {} seconds...".format(self.constant_retry_standoff)) 69 | time.sleep(self.constant_retry_standoff) 70 | print("retrying... (attempt {} of {})".format(attempt_count, self.max_retries)) 71 | print("max no. of retries reached. exiting...") 72 | return None 73 | 74 | def get_crawled_stat(self): 75 | return self.crawled_resource_count 76 | --------------------------------------------------------------------------------
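`SimpleRequestsCrawler` keeps a per-resource page counter, so each `crawl()` call fetches the next page until an empty list signals the end. The same paging loop that `CrawlSession.execute()` in `session.py` below runs can be driven by hand; a minimal sketch against a placeholder site:

```python
from wpscraper.crawler import SimpleRequestsCrawler
from wpscraper.headers import DefaultHeaders

# placeholder target; any site with an open wp-json API should work
crawler = SimpleRequestsCrawler(url="https://your.website.here",
                                headers=DefaultHeaders("your.website.here"))
while True:
    page = crawler.crawl(resource="posts")
    if not page:
        break
    print("fetched {} posts".format(len(page)))
print(crawler.get_crawled_stat())  # per-resource page counters
```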
/wpscraper/document.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | from abc import ABC, abstractmethod 3 | 4 | 5 | class Document(ABC): 6 | def __init__(self, raw_data: Any): 7 | self.raw_data = raw_data 8 | self.data = None 9 | 10 | @abstractmethod 11 | def process_raw_data(self, *args, **kwargs): 12 | pass 13 | 14 | def __repr__(self): 15 | return str(self.data) # __repr__ must return a string; self.data is usually a dict 16 | 17 | 18 | class JSONDocument(Document): 19 | # receive and process raw JSON document from WP-JSON API 20 | def __init__(self, raw_data: dict, **kwargs): 21 | super().__init__(raw_data) 22 | self.process_raw_data(kwargs) 23 | 24 | def process_raw_data(self, kwargs: dict): 25 | # wrap the payload under "data" and merge metadata (resource_type, session_id, ...) alongside it 26 | self.data = {"data": self.raw_data} 27 | self.data.update(kwargs) 28 | 29 | 30 | class RawDocument(Document): 31 | def __init__(self, raw_data: dict): 32 | super().__init__(raw_data) 33 | self.process_raw_data() 34 | 35 | def process_raw_data(self): 36 | self.data = self.raw_data 37 | --------------------------------------------------------------------------------
/wpscraper/headers.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | 4 | class Headers(ABC): 5 | @abstractmethod 6 | def __init__(self): 7 | self.headers = None 8 | 9 | def __repr__(self): 10 | return str(self.headers) 11 | 12 | 13 | class DefaultHeaders(Headers): 14 | def __init__(self, domain): 15 | super().__init__() 16 | self.headers = { 17 | 'Accept': 'application/json, text/javascript, */*; q=0.01', 18 | 'Accept-Encoding': '*', 19 | 'Accept-Language': 'zh-CN,zh;q=0.8', 20 | 'Host': domain, 21 | 'Connection': 'keep-alive', 22 | 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/71.0.3578.80 Chrome/71.0.3578.80 Safari/537.36', 23 | 'X-Requested-With': 'XMLHttpRequest' 24 | } --------------------------------------------------------------------------------
/wpscraper/session.py: -------------------------------------------------------------------------------- 1 | from uuid import uuid4 2 | from typing import List 3 | import datetime 4 | 5 | from urllib.parse import urlparse 6 | 7 | from wpscraper.connector import Connector, FileSystemConnector 8 | from wpscraper.crawler import Crawler, SimpleRequestsCrawler 9 | from wpscraper.headers import DefaultHeaders 10 | from wpscraper.document import JSONDocument 11 | 12 | VALID_PATHS = [ 13 | 'posts', 14 | 'tags', 15 | 'categories' 16 | ] 17 | 18 | 19 | def validate_paths(resources: List[str]): 20 | for path in resources: 21 | if path not in VALID_PATHS: 22 | raise NameError('path "{}" is not a valid path.'.format(path)) 23 | return resources 24 | 25 | 26 | class CrawlSession: 27 | def __init__(self, url: str, resources: List[str], session_id: str = None): 28 | # generate the id per instance; a str(uuid4()) default argument is evaluated once at import time and would be shared by every session 29 | self.session_id = session_id or str(uuid4()) 30 | self.url = url 31 | self.domain = urlparse(self.url).netloc 32 | self.resources = validate_paths(resources) 33 | self.crawler = None 34 | self.connectors = [] 35 | 36 | def set_crawler(self, crawler: Crawler): 37 | self.crawler = crawler 38 | 39 | def add_connector(self, connector: Connector): 40 | self.connectors.append(connector) 41 | 42 | def set_connectors(self, connectors: List[Connector]): 43 | self.connectors = connectors 44 | 45 | def execute(self): 46 | if not (self.crawler and self.connectors): 47 | raise AssertionError("No crawler and/or connector is specified.") 48 | for resource in self.resources: 49 | while True: 50 | raw_documents = self.crawler.crawl(resource=resource) 51 | if not raw_documents: # empty page: this resource is exhausted 52 | break 53 | current_timestamp = datetime.datetime.utcnow().isoformat() 54 | documents = [JSONDocument(document, resource_type=resource, session_id=self.session_id, 55 | crawledtime=current_timestamp) for document in raw_documents] 56 | for document in documents: 57 | for connector in self.connectors: 58 | connector.process_document(resource=resource, document=document) 59 | 60 | 61 | class DefaultCrawlSession(CrawlSession): 62 | def __init__(self, url: str, session_id: str = None): 63 | resources = ['posts', 'tags', 'categories'] 64 | super().__init__(url, resources=resources, session_id=session_id) 65 | headers = DefaultHeaders(self.domain) 66 | self.crawler = SimpleRequestsCrawler(url=self.url, headers=headers) 67 | self.connectors = [FileSystemConnector(folder='./data/{}'.format(self.domain), save_as_individual_files=True)] 68 | --------------------------------------------------------------------------------
/wpscraper/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | # unused function to remove characters before JSON response, might be useful later 5 | def remove_leading_scripts(response_text): 6 | cleaned_text = re.search(r'(\[.*)', response_text) 7 | return cleaned_text.group(1) 8 | --------------------------------------------------------------------------------
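Putting the pieces together, a custom session swaps in a different `Crawler` or `Connector` and calls `execute()`. A sketch that writes to MongoDB instead of the filesystem (host, credentials, and target URL are hypothetical):

```python
from wpscraper.session import CrawlSession
from wpscraper.crawler import SimpleRequestsCrawler
from wpscraper.headers import DefaultHeaders
from wpscraper.connector import MongoDBConnector

url = "https://your.website.here"  # placeholder
session = CrawlSession(url, resources=['posts', 'categories'])
session.set_crawler(SimpleRequestsCrawler(url=url, headers=DefaultHeaders(session.domain)))
session.add_connector(MongoDBConnector(db_host="localhost", db_port=27017,
                                       db_database="wpscraper", db_collection="documents",
                                       username="scraper", password="secret"))
session.execute()
```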