├── .github
│   └── dependabot.yml
├── .gitignore
├── LICENSE
├── README.md
├── crawl.py
├── file2mongo.py
├── legacy
│   ├── __init__.py
│   ├── crawler
│   │   ├── crawler.py
│   │   ├── improved_crawler.py
│   │   └── utils.py
│   └── scripts
│       ├── __init__.py
│       ├── activecyber.py
│       ├── andrewhay.py
│       ├── australiancybersecuritymagazine.py
│       ├── betanews.py
│       ├── bromium.py
│       ├── cfobase.py
│       ├── chinainternetwatch.py
│       ├── comparitech.py
│       ├── crawled_list.csv
│       ├── cuphk.py
│       ├── cybersecuritynews.py
│       ├── cybersecurityreview.py
│       ├── cyberthreat.py
│       ├── helpnetsecurity.py
│       ├── hotforsecurity.py
│       ├── informationage.py
│       ├── infosecblog.py
│       ├── itsecurity.py
│       ├── lastwatchdog.py
│       ├── liquidmatrix.py
│       ├── lookingglasscyber.py
│       ├── malwarebytes.py
│       ├── martinoei.py
│       ├── riskiq.py
│       ├── robertpenz.py
│       ├── scmagazine.py
│       ├── securelist.py
│       ├── securingtomorrow.py
│       ├── securityaffairs.py
│       ├── securityboulevard.py
│       ├── securityledger.py
│       ├── securityweekly.py
│       ├── sensorstechforum.py
│       ├── socialengineer.py
│       ├── sucuri.py
│       ├── techcrunch.py
│       ├── techlear.py
│       ├── threatpost.py
│       ├── tripwire.py
│       ├── trustedsec.py
│       ├── unwire.py
│       ├── vipre.py
│       └── wired.py
├── legacy_crawl_all.py
├── legacy_main.py
├── requirements.txt
└── wpscraper
    ├── connector.py
    ├── crawler.py
    ├── document.py
    ├── headers.py
    ├── session.py
    └── utils.py

/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 | updates:
3 | - package-ecosystem: pip
4 |   directory: "/"
5 |   schedule:
6 |     interval: daily
7 |     time: "21:00"
8 |   open-pull-requests-limit: 10
9 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | data/
2 | venv/
3 | __pycache__/
4 | *.ipynb
5 | .idea/
6 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2020 Orix Au Yeung
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # wordpress-scraper
2 | 
3 | ## Description
4 | 
5 | A simple, easy-to-use scraper for pulling data from the WordPress JSON API.
6 | 
7 | ### Features
8 | - Supports storing crawled documents as MongoDB documents / JSON files
9 | - Automatically retries upon errors
10 | 
11 | ## Requirements
12 | 
13 | - Python 3.7+
14 | 
15 | ## Installation
16 | 
17 | ```bash
18 | pip install -r requirements.txt
19 | ```
20 | 
21 | ## How to use
22 | 
23 | ### Basic
24 | 
25 | Just run `crawl.py` with the site's URL supplied:
26 | 
27 | ```bash
28 | python3 crawl.py https://your.website.here
29 | ```
30 | 
31 | This will crawl the site using `DefaultCrawlSession`, which attempts to crawl all `posts`, `categories` & `tags` from the site.
32 | 
33 | The crawled JSON files will be stored in the directory `./data/`.
34 | 
35 | Most of the time, this will suffice for sites that:
36 | 1. do not require signing in
37 | 2. do not block the JSON API paths
38 | 
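39 | ### Uploading to MongoDB
40 | 
41 | Crawled JSON files can afterwards be pushed into MongoDB with `file2mongo.py`. The positional argument order follows the argparse definition in `file2mongo.py`; the host, port, database, collection, and credentials below are placeholders:
42 | 
43 | ```bash
44 | python3 file2mongo.py ./data/your.website.here localhost 27017 scraper posts admin secret
45 | ```
46 | 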
47 | ### Advanced
48 | For advanced usage and customizations, you may want to look at `wpscraper/session.py` for the actual crawling procedures and make your own `CrawlSession`, as sketched below.
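49 | 
50 | A minimal sketch of such a subclass is shown below. Note that the `resources` attribute is an assumption made for illustration only — `DefaultCrawlSession(url)` and `execute()` are taken from `crawl.py`, but the actual extension hooks live in `wpscraper/session.py`:
51 | 
52 | ```python
53 | from wpscraper.session import DefaultCrawlSession
54 | 
55 | 
56 | class PostsOnlySession(DefaultCrawlSession):
57 |     """Hypothetical session that narrows the crawl to posts only."""
58 |     # Assumed hook; the real attribute/method name may differ in session.py.
59 |     resources = ["posts"]
60 | 
61 | 
62 | session = PostsOnlySession("https://your.website.here")
63 | session.execute()
64 | ```
65 | 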
66 | ## Upcoming Features
67 | 
68 | - [x] Rewrite/Refactor
69 | - [x] MongoDB Connector
70 | - [ ] Async session
71 | - [ ] Authentication Module
72 | - [ ] Cloudflare circumvention
73 | - [ ] Configurable retry policies
74 | - [ ] Full WPv2 API resources support
75 | 
--------------------------------------------------------------------------------
/crawl.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | 
3 | from wpscraper.session import DefaultCrawlSession
4 | 
5 | 
6 | if __name__ == "__main__":
7 |     parser = argparse.ArgumentParser()
8 |     parser.add_argument("url")
9 |     args = parser.parse_args()
10 | 
11 |     session = DefaultCrawlSession(args.url)
12 |     session.execute()
13 | 
--------------------------------------------------------------------------------
/file2mongo.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import argparse
4 | 
5 | from wpscraper.connector import MongoDBConnector
6 | from wpscraper.document import RawDocument
7 | 
8 | parser = argparse.ArgumentParser()
9 | parser.add_argument("filepath", type=str)
10 | parser.add_argument("db_host", type=str)
11 | parser.add_argument("db_port", type=int)
12 | parser.add_argument("db_database", type=str)
13 | parser.add_argument("db_collection", type=str)
14 | parser.add_argument("username", type=str)
15 | parser.add_argument("password", type=str)
16 | 
17 | 
18 | def files_to_mongodb(filepath: str, db_host: str, db_port: int, db_database: str, db_collection: str,
19 |                      username: str, password: str, **kwargs):
20 |     files = [os.path.join(filepath, x) for x in os.listdir(filepath) if x.split(".")[-1].lower() == "json"]
21 |     c = MongoDBConnector(db_host=db_host, db_port=db_port, db_database=db_database, db_collection=db_collection,
22 |                          username=username, password=password, **kwargs)
23 |     for file in files:
24 |         with open(file, 'r') as f:
25 |             json_obj = json.load(f)
26 |             doc = RawDocument(raw_data=json_obj)
27 |             resource = doc.data['resource_type']
28 |             c.process_document(doc, resource)
29 |             print("{} uploaded.".format(file))
30 |     print("done.")
31 | 
32 | 
33 | if __name__ == "__main__":
34 |     args = parser.parse_args()
35 |     files_to_mongodb(filepath=args.filepath, db_host=args.db_host, db_port=args.db_port, db_database=args.db_database,
36 |                      db_collection=args.db_collection, username=args.username, password=args.password)
37 | 
--------------------------------------------------------------------------------
/legacy/__init__.py:
--------------------------------------------------------------------------------
1 | common_header = {
2 |     'Accept': 'application/json, text/javascript, */*; q=0.01',
3 |     'Accept-Encoding': '*',
4 |     'Accept-Language': 'zh-CN,zh;q=0.8',
5 |     'Cookie': '_ga=GA1.2.1769368666.1545970795; _hjIncludedInSample=1; _gid=GA1.2.240454939.1546227015; _fbp=fb.1.1546227015921.374506551; PHPSESSID=a856739d604f1496cd355f2dc35f3371',
6 |     'Connection': 'keep-alive',
7 |     'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/70.0.3538.67 Chrome/70.0.3538.67 Safari/537.36',
8 |     'X-Requested-With': 'XMLHttpRequest'
9 | }
10 | 
11 | 
12 | def common_crawl(wpc):
13 |     wpc.get_tags()
14 |     wpc.get_categories()
15 |     wpc.get_posts()
16 | 
--------------------------------------------------------------------------------
/legacy/crawler/crawler.py:
--------------------------------------------------------------------------------
1 | import json
2 | import time
3 | import os
4 | 
5 | import requests
6 | 
7 | from legacy.crawler import utils
8 | 
9 | 
10 | class WordPressCrawler:
11 |     def __init__(self, url, headers, output_dir, crawl_rate=25, verify_ssl=True, timeout=30, retry_standoff=30,
12 |                  max_retries=5):
13 |         self.api = url + '/wp-json/wp/v2'
14 |         self.headers = headers
15 |         self.crawl_rate = crawl_rate
16 |         self.verify_ssl = verify_ssl
17 |         self.output_dir = output_dir
18 |         self.timeout = timeout
19 |         self.retry_standoff = retry_standoff
20 |         self.max_retries = max_retries
21 |         # self.session = HTMLSession()
22 | 
23 |     def _get(self, path, output_file):
24 |         json_output = self._crawl_jsons(self.api + path)
25 |         if output_file:
26 |             utils.dump_json(json_output, output_file)
27 |         return json_output
28 | 
29 |     def get_categories(self, output_file=None):
30 |         if not output_file:
31 |             output_file = os.path.join(self.output_dir, "cats.json")
32 |         return self._get('/categories', output_file)
33 | 
34 |     def get_tags(self, output_file=None):
35 |         if not output_file:
36 |             output_file = os.path.join(self.output_dir, "tags.json")
37 |         return self._get('/tags', output_file)
38 | 
39 |     def get_posts(self, output_file=None):
40 |         if not output_file:
41 |             output_file = os.path.join(self.output_dir, "posts.json")
42 |         return self._get('/posts', output_file)
43 | 
44 |     def _isjsonarray(self, json):
45 |         return json and isinstance(json, list)
46 | 
47 |     def set_output_dir(self, output_dir):
48 |         self.output_dir = output_dir
49 | 
50 |     def _crawl_jsons(self, url):
51 |         output = []
52 |         i = 1
53 |         while True:
54 |             json_response = self._get_json_response('{}?per_page={}&page={}'.format(url, self.crawl_rate, i))
55 |             if self._isjsonarray(json_response):
56 |                 output += json_response
57 |                 i += 1
58 |             else:
59 |                 break
60 |         return output
61 | 
62 |     def _get_json_response(self, url):
63 |         retries = 1
64 |         while retries <= self.max_retries:
65 |             print("attempt #{} of {} - {}".format(retries, self.max_retries, url))
66 |             try:
67 |                 response = requests.get(url, headers=self.headers, timeout=self.timeout,
68 |                                         verify=self.verify_ssl)  # self.timeout defaults to 30 seconds
69 |                 print('response code: {}'.format(response.status_code))
70 |                 print('response head: {}'.format(response.text[:300]))
71 |                 # some APIs return a valid response despite code 400 (weird, I know)
72 |                 if response.status_code <= 400:
73 |                     # return json.loads(response.iter_lines())
74 |                     return json.loads(response.content.decode('utf-8').strip('\n').strip(' '))
75 |                 else:
76 |                     print("status code returned {}".format(response.status_code))
77 |             except requests.Timeout:
78 |                 print("Timed out.")
79 |             except Exception as e:
80 |                 print("Exception occurred:\nError Type: {}\nDetails: {}".format(type(e), str(e)))
81 |             retries += 1
82 |             print("waiting for {} seconds...".format(self.retry_standoff))
83 |             time.sleep(self.retry_standoff)
84 |         print("max no. of retries reached. exiting...")
85 |         return None
86 | 
--------------------------------------------------------------------------------
/legacy/crawler/improved_crawler.py:
--------------------------------------------------------------------------------
1 | import json
2 | import time
3 | import os
4 | from multiprocessing import Pool
5 | 
6 | import requests
7 | 
8 | from legacy.crawler import utils
9 | 
10 | 
11 | class MultiThreadedCrawler:
12 |     def __init__(self, url, headers, output_dir, crawl_rate=25, verify_ssl=True, timeout=30, retry_standoff=30,
13 |                  max_retries=5):
14 |         self.api = url + '/wp-json/wp/v2'
15 |         self.headers = headers
16 |         self.crawl_rate = crawl_rate
17 |         self.verify_ssl = verify_ssl
18 |         self.output_dir = output_dir
19 |         self.timeout = timeout
20 |         self.retry_standoff = retry_standoff
21 |         self.max_retries = max_retries
22 |         self.result_dump = []
23 | 
24 |     def _get(self, path, output_file):
25 |         json_output = self._crawl_jsons(self.api + path)
26 |         if output_file:
27 |             utils.dump_json(json_output, output_file)
28 |         return json_output
29 | 
30 |     def get_categories(self, output_file=None):
31 |         if not output_file:
32 |             output_file = os.path.join(self.output_dir, "cats.json")
33 |         return self._get('/categories', output_file)
34 | 
35 |     def get_tags(self, output_file=None):
36 |         if not output_file:
37 |             output_file = os.path.join(self.output_dir, "tags.json")
38 |         return self._get('/tags', output_file)
39 | 
40 |     def get_posts(self, output_file=None):
41 |         if not output_file:
42 |             output_file = os.path.join(self.output_dir, "posts.json")
43 |         return self._get('/posts', output_file)
44 | 
45 |     def _isjsonarray(self, json):
46 |         return json and isinstance(json, list)
47 | 
48 |     def set_output_dir(self, output_dir):
49 |         self.output_dir = output_dir
50 | 
51 |     def _crawl_jsons(self, url):
52 |         output = []
53 |         i = 1
54 |         while True:
55 |             urls = ['{}?per_page={}&page={}'.format(url, 1, x) for x in range(i, i+self.crawl_rate, 1)]
56 |             with Pool(self.crawl_rate) as p:
57 |                 json_responses = p.map(self._get_json_response, urls)
58 |             json_responses = list(filter(None, json_responses))
59 |             if self._isjsonarray(json_responses):
60 |                 output += json_responses
61 |                 i += self.crawl_rate
62 |             else:
63 |                 break
64 |         return output
65 | 
66 |     def _get_json_response(self, url):
67 |         retries = 1
68 |         while retries <= self.max_retries:
69 |             print("attempt #{} of {} - {}".format(retries, self.max_retries, url))
70 |             try:
71 |                 response = requests.get(url, headers=self.headers, timeout=self.timeout,
72 |                                         verify=self.verify_ssl)  # self.timeout defaults to 30 seconds
73 |                 print('response code: {}'.format(response.status_code))
74 |                 print('response head: {}'.format(response.text[:300]))
75 |                 # some APIs return a valid response despite code 400 (weird, I know)
76 |                 if response.status_code <= 400:
77 |                     return json.loads(response.content.decode('utf-8').strip('\n').strip(' '))[0]
78 |                 else:
79 |                     print("status code returned {}".format(response.status_code))
80 |             except requests.Timeout:
81 |                 print("Timed out.")
82 |             except Exception as e:
83 |                 print("Exception occurred:\nError Type: {}\nDetails: {}".format(type(e), str(e)))
84 |             retries += 1
85 |             print("waiting for {} seconds...".format(self.retry_standoff))
86 |             time.sleep(self.retry_standoff)
87 |         print("max no. of retries reached. exiting...")
88 |         return None
89 | 
--------------------------------------------------------------------------------
/legacy/crawler/utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | import json
4 | 
5 | 
6 | def ensure_file_directory(file):
7 |     inception(os.path.dirname(os.path.realpath(file)))
8 | 
9 | 
10 | def inception(directory):
11 |     parent_directory = os.path.dirname(directory)
12 |     if not os.path.exists(parent_directory):
13 |         inception(parent_directory)
14 |     if not os.path.exists(directory):
15 |         os.makedirs(directory)
16 | 
17 | 
18 | def dump_json(json_object, output_file):
19 |     ensure_file_directory(output_file)
20 |     with open(output_file, 'w') as f:
21 |         f.write(json.dumps(json_object))
22 | 
23 | 
24 | def remove_leading_scripts(response_text):
25 |     # strip anything before the first JSON array; fall back to the raw text when no match is found
26 |     match = re.search(r'(\[.*)', response_text)
27 |     return match.group(1) if match else response_text
28 | 
--------------------------------------------------------------------------------
/legacy/scripts/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoloSynth1/wordpress-scraper/61c1c832fe072b6e692e0c4c27c66d7dd2fc9920/legacy/scripts/__init__.py
--------------------------------------------------------------------------------
/legacy/scripts/activecyber.py:
--------------------------------------------------------------------------------
1 | from legacy.crawler import crawler
2 | import os
3 | 
4 | url = "https://www.activecyber.net"
5 | output_dir = os.path.join('.', 'data', 'activecyber.net')
6 | headers = {
7 |     'Accept': 'application/json, text/javascript, */*; q=0.01',
8 |     'Accept-Encoding': '*',
9 |     'Accept-Language': 'zh-CN,zh;q=0.8',
10 |     'Host': 'www.activecyber.net',
11 |     'Connection': 'keep-alive',
12 |     'Cookie': '_mkto_trk=id:497-ITQ-712&token:_mch-bromium.com-1545360431008-12984; _biz_sid=919162; _ga=GA1.2.1945164141.1545360432; _gid=GA1.2.949409536.1545360432; _biz_uid=a18ee78fd1664b1ecb82da38af7ca704; _biz_flagsA=%7B%22Version%22%3A1%2C%22Mkto%22%3A%221%22%2C%22XDomain%22%3A%221%22%7D; _biz_nA=4; _biz_pendingA=%5B%5D',
13 |     'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/71.0.3578.80 Chrome/71.0.3578.80 Safari/537.36',
14 |     'X-Requested-With': 'XMLHttpRequest'
15 | }
16 | 
17 | if __name__ == "__main__":
18 |     from legacy import common_crawl
19 |     wpc = crawler.WordPressCrawler(url, headers, output_dir)
20 |     common_crawl(wpc)
21 | 
--------------------------------------------------------------------------------
/legacy/scripts/andrewhay.py:
--------------------------------------------------------------------------------
1 | from legacy.crawler import crawler
2 | import os
3 | 
4 | url = "https://www.andrewhay.ca"
5 | output_dir = os.path.join('.', 'data', 'andrewhay.ca')
6 | headers = {
7 |     'Accept':'application/json, text/javascript, */*; q=0.01',
8 |     'Accept-Encoding':'*',
9 |     'Accept-Language':'zh-CN,zh;q=0.8',
10 |     'Cookie': '_ga=GA1.2.244034378.1540788304; _gid=GA1.2.222331079.1540788304',
11 |     'Connection':'keep-alive',
12 |     'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu
Chromium/70.0.3538.67 Chrome/70.0.3538.67 Safari/537.36', 13 | 'X-Requested-With':'XMLHttpRequest' 14 | } 15 | 16 | if __name__ == "__main__": 17 | from legacy import common_crawl 18 | wpc = crawler.WordPressCrawler(url, headers, output_dir) 19 | common_crawl(wpc) 20 | -------------------------------------------------------------------------------- /legacy/scripts/australiancybersecuritymagazine.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "https://australiancybersecuritymagazine.com.au" 5 | output_dir = os.path.join('.', 'data', 'australiancybersecuritymagazine.com.au') 6 | headers = { 7 | 'Accept':'application/json, text/javascript, */*; q=0.01', 8 | 'Accept-Encoding':'*', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'Host': 'australiancybersecuritymagazine.com.au', 11 | 'Connection':'keep-alive', 12 | 'Cookie': '_mkto_trk=id:497-ITQ-712&token:_mch-bromium.com-1545360431008-12984; _biz_sid=919162; _ga=GA1.2.1945164141.1545360432; _gid=GA1.2.949409536.1545360432; _biz_uid=a18ee78fd1664b1ecb82da38af7ca704; _biz_flagsA=%7B%22Version%22%3A1%2C%22Mkto%22%3A%221%22%2C%22XDomain%22%3A%221%22%7D; _biz_nA=4; _biz_pendingA=%5B%5D', 13 | 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/71.0.3578.80 Chrome/71.0.3578.80 Safari/537.36', 14 | 'X-Requested-With':'XMLHttpRequest' 15 | } 16 | 17 | if __name__ == "__main__": 18 | from legacy import common_crawl 19 | wpc = crawler.WordPressCrawler(url, headers, output_dir) 20 | common_crawl(wpc) 21 | -------------------------------------------------------------------------------- /legacy/scripts/betanews.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "https://betanews.com" 5 | output_dir = os.path.join('.', 'data', 'betanews.com') 6 | headers = { 7 | 'Accept':'application/json, text/javascript, */*; q=0.01', 8 | 'Accept-Encoding':'*', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'Cookie': '_ga=GA1.2.529771702.1541147410; __gads=ID=cddffd965b59ec5f:T=1541147412:S=ALNI_MYQm3Vphovvte59_MWZctfuIFxpmg; zdbb_swap_krux_id=1; _gid=GA1.2.1535543751.1541383876; geoCC=HK', 11 | 'Host': 'betanews.com', 12 | 'Connection':'keep-alive', 13 | 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/70.0.3538.67 Chrome/70.0.3538.67 Safari/537.36', 14 | 'X-Requested-With':'XMLHttpRequest' 15 | } 16 | 17 | if __name__ == "__main__": 18 | from legacy import common_crawl 19 | wpc = crawler.WordPressCrawler(url, headers, output_dir, crawl_rate=100) 20 | common_crawl(wpc) 21 | -------------------------------------------------------------------------------- /legacy/scripts/bromium.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "http://www.bromium.com" 5 | output_dir = os.path.join('.', 'data', 'bromium.com') 6 | headers = { 7 | 'Accept':'application/json, text/javascript, */*; q=0.01', 8 | 'Accept-Encoding':'*', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'Host': 'www.bromium.com', 11 | 'Connection':'keep-alive', 12 | 'Cookie': '_mkto_trk=id:497-ITQ-712&token:_mch-bromium.com-1545360431008-12984; _biz_sid=919162; _ga=GA1.2.1945164141.1545360432; _gid=GA1.2.949409536.1545360432; _biz_uid=a18ee78fd1664b1ecb82da38af7ca704; 
_biz_flagsA=%7B%22Version%22%3A1%2C%22Mkto%22%3A%221%22%2C%22XDomain%22%3A%221%22%7D; _biz_nA=4; _biz_pendingA=%5B%5D', 13 | 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/71.0.3578.80 Chrome/71.0.3578.80 Safari/537.36', 14 | 'X-Requested-With':'XMLHttpRequest' 15 | } 16 | 17 | if __name__ == "__main__": 18 | from legacy import common_crawl 19 | wpc = crawler.WordPressCrawler(url, headers, output_dir) 20 | common_crawl(wpc) 21 | -------------------------------------------------------------------------------- /legacy/scripts/cfobase.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "https://cfobase.com/" 5 | output_dir = os.path.join('.', 'data', 'cfobase.com') 6 | headers = { 7 | 'Accept':'application/json, text/javascript, */*; q=0.01', 8 | 'Accept-Encoding':'*', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'Host': 'cfobase.com', 11 | 'Connection':'keep-alive', 12 | 'Cookie': '_mkto_trk=id:497-ITQ-712&token:_mch-bromium.com-1545360431008-12984; _biz_sid=919162; _ga=GA1.2.1945164141.1545360432; _gid=GA1.2.949409536.1545360432; _biz_uid=a18ee78fd1664b1ecb82da38af7ca704; _biz_flagsA=%7B%22Version%22%3A1%2C%22Mkto%22%3A%221%22%2C%22XDomain%22%3A%221%22%7D; _biz_nA=4; _biz_pendingA=%5B%5D', 13 | 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/71.0.3578.80 Chrome/71.0.3578.80 Safari/537.36', 14 | 'X-Requested-With':'XMLHttpRequest' 15 | } 16 | 17 | if __name__ == "__main__": 18 | from legacy import common_crawl 19 | wpc = crawler.WordPressCrawler(url, headers, output_dir) 20 | common_crawl(wpc) 21 | -------------------------------------------------------------------------------- /legacy/scripts/chinainternetwatch.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "https://www.chinainternetwatch.com" 5 | output_dir = os.path.join('.', 'data', 'chinainternetwatch.com') 6 | headers = { 7 | 'Accept':'application/json, text/javascript, */*; q=0.01', 8 | 'Accept-Encoding':'*', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'Cookie': '_ga=GA1.2.1769368666.1545970795; _hjIncludedInSample=1; _gid=GA1.2.240454939.1546227015; _fbp=fb.1.1546227015921.374506551; PHPSESSID=a856739d604f1496cd355f2dc35f3371', 11 | 'Connection':'keep-alive', 12 | 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/70.0.3538.67 Chrome/70.0.3538.67 Safari/537.36', 13 | 'X-Requested-With':'XMLHttpRequest' 14 | } 15 | 16 | if __name__ == "__main__": 17 | from legacy import common_crawl 18 | wpc = crawler.WordPressCrawler(url, headers, output_dir, crawl_rate=100) 19 | common_crawl(wpc) 20 | -------------------------------------------------------------------------------- /legacy/scripts/comparitech.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "https://www.comparitech.com" 5 | output_dir = os.path.join('.', 'data', 'www.comparitech.com') 6 | headers = { 7 | 'Accept':'application/json, text/javascript, */*; q=0.01', 8 | 'Accept-Encoding':'*', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'Cookie': '_conv_r=s:heimdalsecurity.com*m:referral*t:*c:; _ga=GA1.2.235624317.1541147443; PHPSESSID=4865b95b6bd5a85c6e071c1e2f47497f; _gid=GA1.2.595733218.1541383875; 
_conv_v=vi:1*sc:4*cs:1541383837*fs:1541147440*pv:5*exp:{}*ps:1541221464; _conv_s=si:4*sh:1541383837127-0.002642356216356223*pv:2; _ceg.s=php9gm; _ceg.u=php9gm', 11 | 'Connection':'keep-alive', 12 | 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/70.0.3538.67 Chrome/70.0.3538.67 Safari/537.36', 13 | 'X-Requested-With':'XMLHttpRequest' 14 | } 15 | 16 | if __name__ == "__main__": 17 | from legacy import common_crawl 18 | wpc = crawler.WordPressCrawler(url, headers, output_dir) 19 | common_crawl(wpc) 20 | -------------------------------------------------------------------------------- /legacy/scripts/crawled_list.csv: -------------------------------------------------------------------------------- 1 | url,output_dir 2 | "https://australiancybersecuritymagazine.com.au","./data/australiancybersecuritymagazine.com.au" 3 | "https://cyberriskleaders.com","./data/cyberriskleaders.com" 4 | "https://techcrunch.com","./data/techcrunch.com" 5 | "https://www.wired.com","./data/wired.com" 6 | "https://www.varonis.com/blog","./data/www.varonis.com" 7 | "https://www.asiapacificsecuritymagazine.com","./data/www.asiapacificsecuritymagazine.com" 8 | "https://www.nozominetworks.com","./data/www.nozominetworks.com" 9 | "https://www.delapcpa.com","./data/delapcyber.com" 10 | "https://hub.packtpub.com","./data/hub.packtpub.com" 11 | "https://irishtechnews.ie","./data/irishtechnews.ie" 12 | "https://amuedge.com","./data/incyberdefense.com" 13 | "https://www.incidentresponse.com","./data/www.incidentresponse.com" 14 | "https://www.informationsecuritybuzz.com/","./data/www.informationsecuritybuzz.com" 15 | "https://www.cbronline.com","./data/www.cbronline.com" 16 | "https://www.cyberscoop.com","./data/www.cyberscoop.com" 17 | "https://www.fedscoop.com","./data/www.fedscoop.com" 18 | "https://technode.com","./data/technode.com" 19 | "https://www.pymnts.com","./data/www.pymnts.com" 20 | "https://www.tripwire.com/state-of-security","./data/tripwire.com" 21 | "https://www.scmagazine.com","./data/www.scmagazine.com" 22 | "https://arstechnica.com","./data/arstechnica.com" 23 | "https://www.rd.com","./data/rd.com" 24 | "https://blog.mozilla.org","./data/blog.mozilla.org" 25 | "https://pulse.target.com","./data/pulse.target.com" 26 | "https://hackaday.com","./data/hackaday.com" 27 | "http://blog.ecocn.org","./data/blog.ecocn.org" 28 | "https://time.com","./data/time.com" 29 | "https://www.itworldcanada.com/","./data/www.itworldcanada.com" 30 | "https://www.sfexaminer.com","./data/sfexaminer.com" 31 | "https://cybersecuritynews.com","./data/cybersecuritynews.com" 32 | "https://cybersecurityventures.com","./data/cybersecurityventures.com" 33 | "https://www.innovationnewsnetwork.com","./data/innovationnewsnetwork.com" 34 | "https://www.telos.com","./data/telos.com" 35 | 36 | -------------------------------------------------------------------------------- /legacy/scripts/cuphk.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "https://www.cup.com.hk" 5 | output_dir = os.path.join('.', 'data', 'cup.com.hk') 6 | headers = { 7 | 'Accept':'application/json, text/javascript, */*; q=0.01', 8 | 'Accept-Encoding':'*', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'Cookie': '__cfduid=d0c4282dcdb115dbcd08edfba7e0e8e721560849079', 11 | 'Connection':'keep-alive', 12 | 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/70.0.3538.67 
Chrome/70.0.3538.67 Safari/537.36', 13 | 'X-Requested-With':'XMLHttpRequest' 14 | } 15 | 16 | if __name__ == "__main__": 17 | from legacy import common_crawl 18 | wpc = crawler.WordPressCrawler(url, headers, output_dir, crawl_rate=100) 19 | common_crawl(wpc) 20 | -------------------------------------------------------------------------------- /legacy/scripts/cybersecuritynews.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "https://cybersecuritynews.co.uk/" 5 | output_dir = os.path.join('.', 'data', 'cybersecuritynews.co.uk') 6 | headers = { 7 | 'Accept':'application/json, text/javascript, */*; q=0.01', 8 | 'Accept-Encoding':'*', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'Cookie': '__cfduid=d168be0b4533032d20c2aa278ecb62fe31541147413; check=true; _ga=GA1.2.130067401.1541147420; _gid=GA1.2.1176639717.1541147420; bd112=Xc5BCsIwEAXQu2ThShICEbES3Lj1BEYkbadNoE3CZGoV8e622enqD58H869vNuHAKuaIUmWEEfM889pTCx2EFpA3cTQiewIjLtYHI3oIgHY4%2B97TEpbsKWkjNnZMR9KrvLs4QrI9lK4dNIRyOe0iqQwNx%2FrJ48P9PApAReVVdREXN6Gn1%2F%2BaggBRsy0jP0KmpWCV3Ckp1V7Jw%2Bf2BQ%3D%3D; AMCVS_0E920C0F53DA9E9B0A490D45%40AdobeOrg=1; AMCV_0E920C0F53DA9E9B0A490D45%40AdobeOrg=2035320058%7CMCIDTS%7C17838%7CMCMID%7C26407325438182498473478927876586191970%7CMCAAMLH-1541752221%7C11%7CMCAAMB-1541752221%7CRKhpRz8krg2tLO6pguXWp5olkAcUniQYPHaMWWgdJ3xzPWQmdj0y%7CMCOPTOUT-1541154621s%7CNONE%7CMCAID%7CNONE%7CvVersion%7C3.3.0; mbox=session#8b43bd7f0847437182a44000418828d2#1541149284|PC#8b43bd7f0847437182a44000418828d2.22_35#1604392224; s_cc=true; aam_uuid=26426712967223226083480932539320192677; s_ht=1541147427592; s_hc=1%7C0%7C0%7C0%7C0; s_ppvl=h4s%253Ahome%2C29%2C29%2C1009%2C1920%2C1009%2C1920%2C1080%2C1%2CP; s_ppn=h4s%3Ahome; s_ppv=h4s%253Ahome%2C84%2C29%2C2946%2C1920%2C1009%2C1920%2C1080%2C1%2CP', 11 | 'Connection':'keep-alive', 12 | 'Host': 'cybersecuritynews.co.uk', 13 | 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/70.0.3538.67 Chrome/70.0.3538.67 Safari/537.36', 14 | 'X-Requested-With':'XMLHttpRequest' 15 | } 16 | 17 | if __name__ == "__main__": 18 | from legacy import common_crawl 19 | wpc = crawler.WordPressCrawler(url, headers, output_dir) 20 | common_crawl(wpc) 21 | -------------------------------------------------------------------------------- /legacy/scripts/cybersecurityreview.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "https://www.cybersecurity-review.com" 5 | output_dir = os.path.join('.', 'data', 'cybersecurity-review.com') 6 | headers = { 7 | 'Accept':'application/json, text/javascript, */*; q=0.01', 8 | 'Accept-Encoding':'*', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'Cookie': '_ga=GA1.2.910557633.1540266945; _gid=GA1.2.1879377337.1541147383', 11 | 'Host': 'www.cybersecurity-review.com', 12 | 'Connection':'keep-alive', 13 | 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/70.0.3538.67 Chrome/70.0.3538.67 Safari/537.36', 14 | 'X-Requested-With':'XMLHttpRequest' 15 | } 16 | 17 | if __name__ == "__main__": 18 | from legacy import common_crawl 19 | wpc = crawler.WordPressCrawler(url, headers, output_dir) 20 | common_crawl(wpc) 21 | -------------------------------------------------------------------------------- /legacy/scripts/cyberthreat.py: 
-------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "http://cyberthreat.blog" 5 | output_dir = os.path.join('.', 'data', 'cyberthreat.blog') 6 | headers = { 7 | 'Accept':'application/json, text/javascript, */*; q=0.01', 8 | 'Accept-Encoding':'*', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'Connection':'keep-alive', 11 | 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36', 12 | 'X-Requested-With':'XMLHttpRequest' 13 | } 14 | 15 | if __name__ == "__main__": 16 | from legacy import common_crawl 17 | wpc = crawler.WordPressCrawler(url, headers, output_dir) 18 | common_crawl(wpc) 19 | -------------------------------------------------------------------------------- /legacy/scripts/helpnetsecurity.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "https://www.helpnetsecurity.com" 5 | output_dir = os.path.join('.', 'data', 'helpnetsecurity.com') 6 | headers = { 7 | 'Accept':'application/json, text/javascript, */*; q=0.01', 8 | 'Accept-Encoding':'*', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'Cookie': '_ga=GA1.2.910557633.1540266945; _gid=GA1.2.1879377337.1541147383', 11 | 'Host': 'www.helpnetsecurity.com', 12 | 'Connection':'keep-alive', 13 | 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/70.0.3538.67 Chrome/70.0.3538.67 Safari/537.36', 14 | 'X-Requested-With':'XMLHttpRequest' 15 | } 16 | 17 | if __name__ == "__main__": 18 | from legacy import common_crawl 19 | wpc = crawler.WordPressCrawler(url, headers, output_dir) 20 | common_crawl(wpc) 21 | -------------------------------------------------------------------------------- /legacy/scripts/hotforsecurity.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "https://hotforsecurity.bitdefender.com" 5 | output_dir = os.path.join('.', 'data', 'hotforsecurity.bitdefender.com') 6 | headers = { 7 | 'Accept':'application/json, text/javascript, */*; q=0.01', 8 | 'Accept-Encoding':'*', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'Cookie': '__cfduid=d168be0b4533032d20c2aa278ecb62fe31541147413; check=true; _ga=GA1.2.130067401.1541147420; _gid=GA1.2.1176639717.1541147420; bd112=Xc5BCsIwEAXQu2ThShICEbES3Lj1BEYkbadNoE3CZGoV8e622enqD58H869vNuHAKuaIUmWEEfM889pTCx2EFpA3cTQiewIjLtYHI3oIgHY4%2B97TEpbsKWkjNnZMR9KrvLs4QrI9lK4dNIRyOe0iqQwNx%2FrJ48P9PApAReVVdREXN6Gn1%2F%2BaggBRsy0jP0KmpWCV3Ckp1V7Jw%2Bf2BQ%3D%3D; AMCVS_0E920C0F53DA9E9B0A490D45%40AdobeOrg=1; AMCV_0E920C0F53DA9E9B0A490D45%40AdobeOrg=2035320058%7CMCIDTS%7C17838%7CMCMID%7C26407325438182498473478927876586191970%7CMCAAMLH-1541752221%7C11%7CMCAAMB-1541752221%7CRKhpRz8krg2tLO6pguXWp5olkAcUniQYPHaMWWgdJ3xzPWQmdj0y%7CMCOPTOUT-1541154621s%7CNONE%7CMCAID%7CNONE%7CvVersion%7C3.3.0; mbox=session#8b43bd7f0847437182a44000418828d2#1541149284|PC#8b43bd7f0847437182a44000418828d2.22_35#1604392224; s_cc=true; aam_uuid=26426712967223226083480932539320192677; s_ht=1541147427592; s_hc=1%7C0%7C0%7C0%7C0; s_ppvl=h4s%253Ahome%2C29%2C29%2C1009%2C1920%2C1009%2C1920%2C1080%2C1%2CP; s_ppn=h4s%3Ahome; s_ppv=h4s%253Ahome%2C84%2C29%2C2946%2C1920%2C1009%2C1920%2C1080%2C1%2CP', 11 | 'Connection':'keep-alive', 12 | 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu 
Chromium/70.0.3538.67 Chrome/70.0.3538.67 Safari/537.36', 13 | 'X-Requested-With':'XMLHttpRequest' 14 | } 15 | 16 | if __name__ == "__main__": 17 | from legacy import common_crawl 18 | wpc = crawler.WordPressCrawler(url, headers, output_dir) 19 | common_crawl(wpc) 20 | -------------------------------------------------------------------------------- /legacy/scripts/informationage.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "https://www.information-age.com" 5 | output_dir = os.path.join('.', 'data', 'information-age.com') 6 | headers = { 7 | 'Accept':'application/json, text/javascript, */*; q=0.01', 8 | 'Accept-Encoding':'*', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'Host': 'www.information-age.com', 11 | 'Connection':'keep-alive', 12 | 'Cookie': '_mkto_trk=id:497-ITQ-712&token:_mch-bromium.com-1545360431008-12984; _biz_sid=919162; _ga=GA1.2.1945164141.1545360432; _gid=GA1.2.949409536.1545360432; _biz_uid=a18ee78fd1664b1ecb82da38af7ca704; _biz_flagsA=%7B%22Version%22%3A1%2C%22Mkto%22%3A%221%22%2C%22XDomain%22%3A%221%22%7D; _biz_nA=4; _biz_pendingA=%5B%5D', 13 | 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/71.0.3578.80 Chrome/71.0.3578.80 Safari/537.36', 14 | 'X-Requested-With':'XMLHttpRequest' 15 | } 16 | 17 | if __name__ == "__main__": 18 | from legacy import common_crawl 19 | wpc = crawler.WordPressCrawler(url, headers, output_dir) 20 | common_crawl(wpc) 21 | -------------------------------------------------------------------------------- /legacy/scripts/infosecblog.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "https://www.infosecblog.org" 5 | output_dir = os.path.join('.', 'data', 'infosecblog.org') 6 | headers = { 7 | 'Accept':'application/json, text/javascript, */*; q=0.01', 8 | 'Accept-Encoding':'*', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'Cookie': '__cfduid=dfc1c22a9bb42efd57b3c2231d0c43d2a1540799668; _ga=GA1.2.958394734.1540799670; _gid=GA1.2.1380839925.1540799670', 11 | 'Connection':'keep-alive', 12 | 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/70.0.3538.67 Chrome/70.0.3538.67 Safari/537.36', 13 | 'X-Requested-With':'XMLHttpRequest' 14 | } 15 | 16 | if __name__ == "__main__": 17 | from legacy import common_crawl 18 | wpc = crawler.WordPressCrawler(url, headers, output_dir) 19 | common_crawl(wpc) 20 | -------------------------------------------------------------------------------- /legacy/scripts/itsecurity.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "http://itsecurity.co.uk" 5 | output_dir = os.path.join('.', 'data', 'itsecurity.co.uk') 6 | headers = { 7 | 'Accept':'application/json, text/javascript, */*; q=0.01', 8 | 'Accept-Encoding':'*', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'Host': 'itsecurity.co.uk', 11 | 'Connection':'keep-alive', 12 | 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/70.0.3538.67 Chrome/70.0.3538.67 Safari/537.36', 13 | 'X-Requested-With':'XMLHttpRequest' 14 | } 15 | 16 | if __name__ == "__main__": 17 | from legacy import common_crawl 18 | wpc = crawler.WordPressCrawler(url, headers, output_dir) 19 | common_crawl(wpc) 20 | 
-------------------------------------------------------------------------------- /legacy/scripts/lastwatchdog.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "https://www.lastwatchdog.com" 5 | output_dir = os.path.join('.', 'data', 'lastwatchdog') 6 | headers = { 7 | 'Accept':'application/json, text/javascript, */*; q=0.01', 8 | 'Accept-Encoding':'*', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'cookie':'__cfduid=dfea0ee9fe9b8670fdd8222535adb9ec21540527308; __unam=c02b1ec-166acddd608-734d3c03-1', 11 | 'Connection':'keep-alive', 12 | 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36', 13 | 'X-Requested-With':'XMLHttpRequest' 14 | } 15 | 16 | if __name__ == "__main__": 17 | from legacy import common_crawl 18 | wpc = crawler.WordPressCrawler(url, headers, output_dir) 19 | common_crawl(wpc) 20 | -------------------------------------------------------------------------------- /legacy/scripts/liquidmatrix.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "https://www.liquidmatrix.org/blog" 5 | output_dir = os.path.join('.', 'data', 'liquidmatrix.org') 6 | headers = { 7 | 'Accept':'application/json, text/javascript, */*; q=0.01', 8 | 'Accept-Encoding':'*', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'Cookie': '__atuvc=1%7C44; __atuvs=5bd6b92398768410000', 11 | 'Host': 'www.liquidmatrix.org', 12 | 'Connection':'keep-alive', 13 | 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/70.0.3538.67 Chrome/70.0.3538.67 Safari/537.36', 14 | 'X-Requested-With':'XMLHttpRequest' 15 | } 16 | 17 | if __name__ == "__main__": 18 | from legacy import common_crawl 19 | wpc = crawler.WordPressCrawler(url, headers, output_dir) 20 | common_crawl(wpc) 21 | -------------------------------------------------------------------------------- /legacy/scripts/lookingglasscyber.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "https://www.lookingglasscyber.com" 5 | output_dir = os.path.join('.', 'data', 'lookingglasscyber') 6 | headers = { 7 | 'Accept':'application/json, text/javascript, */*; q=0.01', 8 | 'Accept-Encoding':'*', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'Connection':'keep-alive', 11 | 'Cookie':'__cfduid=d8bd3b5ad23455ce8b26bcbb869ff80c31540451497', 12 | 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36', 13 | 'X-Requested-With':'XMLHttpRequest' 14 | } 15 | 16 | if __name__ == "__main__": 17 | from legacy import common_crawl 18 | wpc = crawler.WordPressCrawler(url, headers, output_dir) 19 | common_crawl(wpc) 20 | -------------------------------------------------------------------------------- /legacy/scripts/malwarebytes.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "https://blog.malwarebytes.com" 5 | output_dir = os.path.join('.', 'data', 'blog.malwarebytes.com') 6 | headers = { 7 | 'Accept':'application/json, text/javascript, */*; q=0.01', 8 | 'Accept-Encoding':'*', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'Cookie': '_ga=GA1.2.1230427134.1541147423; _gid=GA1.2.1527966877.1541383870; 
_fbp=fb.1.1541468793370.261995265', 11 | 'Connection':'keep-alive', 12 | 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/70.0.3538.67 Chrome/70.0.3538.67 Safari/537.36', 13 | 'X-Requested-With':'XMLHttpRequest' 14 | } 15 | 16 | if __name__ == "__main__": 17 | from legacy import common_crawl 18 | wpc = crawler.WordPressCrawler(url, headers, output_dir, crawl_rate=100) 19 | common_crawl(wpc) 20 | -------------------------------------------------------------------------------- /legacy/scripts/martinoei.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "https://martinoei.com" 5 | output_dir = os.path.join('.', 'data', 'martinoei.com') 6 | headers = { 7 | 'Accept':'application/json, text/javascript, */*; q=0.01', 8 | 'Accept-Encoding':'*', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'Cookie': '_ga=GA1.2.1769368666.1545970795; _hjIncludedInSample=1; _gid=GA1.2.240454939.1546227015; _fbp=fb.1.1546227015921.374506551; PHPSESSID=a856739d604f1496cd355f2dc35f3371', 11 | 'Connection':'keep-alive', 12 | 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/70.0.3538.67 Chrome/70.0.3538.67 Safari/537.36', 13 | 'X-Requested-With':'XMLHttpRequest' 14 | } 15 | 16 | if __name__ == "__main__": 17 | from legacy import common_crawl 18 | wpc = crawler.WordPressCrawler(url, headers, output_dir, crawl_rate=100) 19 | common_crawl(wpc) 20 | -------------------------------------------------------------------------------- /legacy/scripts/riskiq.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "https://www.riskiq.com" 5 | output_dir = os.path.join('.', 'data', 'riskiq.com') 6 | 7 | headers = { 8 | 'Accept':'application/json, text/javascript, */*; q=0.01', 9 | 'Accept-Encoding':'gzip, deflate, sdch, br', 10 | 'Accept-Language':'zh-CN,zh;q=0.8', 11 | 'Connection':'keep-alive', 12 | 'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36', 13 | 'X-Requested-With':'XMLHttpRequest' 14 | } 15 | 16 | if __name__ == "__main__": 17 | from legacy import common_crawl 18 | wpc = crawler.WordPressCrawler(url, headers, output_dir) 19 | common_crawl(wpc) 20 | -------------------------------------------------------------------------------- /legacy/scripts/robertpenz.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "https://robert.penz.name" 5 | output_dir = os.path.join('.', 'data', 'robert.penz.name') 6 | headers = { 7 | 'Accept':'application/json, text/javascript, */*; q=0.01', 8 | 'Accept-Encoding':'*', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'Cookie': 'pay_ent_smp=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCIsInZlciI6MX0.eyJ1cmxzIjpbIi8yMDAxLzAzL2hvbmV5cG90cy1iYWl0LWZvci10aGUtY3JhY2tlciJdLCJjbnQiOjEsIm1heCI6NCwiZXhwIjoyMDE4MTB9.SvGXeHRGxLna6rX9Kr9qTXrSU-pSbIkrDE_hQwQ60O4; CN_xid=d75239c8-659c-476c-9f5d-ea0547275e55; _sdsat_landing_page=https://www.wired.com/2001/03/honeypots-bait-for-the-cracker/|1540044766455; _sdsat_session_count=1; _sdsat_traffic_source=https://www.google.com.hk/; visitedCount_jwt=1; AMCVS_F7093025512D2B690A490D44%40AdobeOrg=1; CN_sp=fa58760d-4e72-4ce3-b443-2c9b42243770; CN_su=d19800c0-59da-4191-a88c-5c4148ebb692; CN_segments=; 
_ga=GA1.2.802886981.1540044768; fpcid=2456436582056134046_FP; v30=google.com.hk; v39=google.com.hk; s_cc=true; __gads=ID=f8c89cd159acd50f:T=1540044768:S=ALNI_MainR0wk-mflQoNeN_UO7dory-7gQ; aamconde=conde%3Dsv%3BCN%3D764985; aam_optimizely=aam%3D226821; aam_uuid=26426712967223226083480932539320192677; _sdsat_lt_pages_viewed=2; _sdsat_pages_viewed=2; _sdsat_AAM_UUID=26426712967223226083480932539320192677; CN_visits_m=1541001600572%26vn%3D2; CN_in_visit_m=true; sID=2955f7c3-91dd-4bc7-abc7-999968ecee3c; pID=1d3e3648-9926-466b-beec-6a2e98c8702c; AMCV_F7093025512D2B690A490D44%40AdobeOrg=1099438348%7CMCIDTS%7C17834%7CMCMID%7C26574736731012813853459705698242364028%7CMCAAMLH-1541404553%7C3%7CMCAAMB-1541404553%7CRKhpRz8krg2tLO6pguXWp5olkAcUniQYPHaMWWgdJ3xzPWQmdj0y%7CMCOPTOUT-1540806953s%7CNONE%7CMCAID%7CNONE%7CvVersion%7C2.1.0; s_vnum_m=1541001600667%26vn%3D2; sinvisit_m=true; s_depth=1; timeSpent=1540799753187; s_ppn=https%3A%2F%2Fwww.wired.com%2Fcategory%2Fsecurity%2Fthreatlevel%2F; s_pct=Index; s_nr=1540799753188-Repeat; sailthru_pageviews=1; bounceClientVisit2825v=N4IgNgDiBcIBYBcEQM4FIDMBBNAmAYnvgO6kB0xAlgE4CmAJmQMYD2AtkUwIYK0DmLagE8iCOHR5haAN1pgiIADQhqMECAC+QA; _polar_tu=*_%22mgtn%22_@2Q_u_@_97f78f97-5c77-4716-b78f-a0dccc974ab0_Q_n_@3Q_s_@2Q_sc_@*_v_@1Q_a_@1+Q_ss_@_%22phcop7_Q_sl_@_%22phcop7_Q_sd_@*+Q_v_@nullQ_vc_@*_e_@0+Q_vs_@_%22phcop7_Q_vl_@_%22phcop7_Q_vd_@*+Q_vu_@_555fdf068442e929ddada46236b2ea5b_Q_vf_@_%22jnu0e179_+; _parsely_session={%22sid%22:2%2C%22surl%22:%22https://www.wired.com/category/threatlevel/%22%2C%22sref%22:%22%22%2C%22sts%22:1540799755083%2C%22slts%22:1540044768270}; _parsely_visitor={%22id%22:%22c609b887-dad6-414c-906e-f6a107dbb880%22%2C%22session_count%22:2%2C%22last_session_ts%22:1540799755083}; sailthru_content=e43720c11f5345e88d86bc1d5be31f74e2553d06f8d9ea3b9cb7420abe100f46; sailthru_visitor=1cb98baf-6809-4646-a0f8-aa82685e000a; AMP_TOKEN=%24NOT_FOUND; _gid=GA1.2.368334264.1540799760', 11 | 'Connection':'keep-alive', 12 | 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/70.0.3538.67 Chrome/70.0.3538.67 Safari/537.36', 13 | 'X-Requested-With':'XMLHttpRequest' 14 | } 15 | 16 | if __name__ == "__main__": 17 | from legacy import common_crawl 18 | wpc = crawler.WordPressCrawler(url, headers, output_dir) 19 | common_crawl(wpc) 20 | -------------------------------------------------------------------------------- /legacy/scripts/scmagazine.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "https://www.scmagazine.com" 5 | output_dir = os.path.join('.', 'data', 'www.scmagazine.com') 6 | 7 | headers = { 8 | 'Accept':'application/json, text/javascript, */*; q=0.01', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'Connection':'keep-alive', 11 | 'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36', 12 | 'X-Requested-With':'XMLHttpRequest' 13 | } 14 | 15 | if __name__ == "__main__": 16 | from legacy import common_crawl 17 | wpc = crawler.WordPressCrawler(url, headers, output_dir) 18 | common_crawl(wpc) 19 | -------------------------------------------------------------------------------- /legacy/scripts/securelist.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "https://securelist.com" 5 | output_dir = os.path.join('.', 'data', 'securelist.com') 6 | 
headers = { 7 | 'Accept':'application/json, text/javascript, */*; q=0.01', 8 | 'Accept-Encoding':'*', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'Cookie': '_ga=GA1.2.1769368666.1545970795; _hjIncludedInSample=1; _gid=GA1.2.240454939.1546227015; _fbp=fb.1.1546227015921.374506551; PHPSESSID=a856739d604f1496cd355f2dc35f3371', 11 | 'Connection':'keep-alive', 12 | 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/70.0.3538.67 Chrome/70.0.3538.67 Safari/537.36', 13 | 'X-Requested-With':'XMLHttpRequest' 14 | } 15 | 16 | if __name__ == "__main__": 17 | from legacy import common_crawl 18 | wpc = crawler.WordPressCrawler(url, headers, output_dir, crawl_rate=5) 19 | common_crawl(wpc) 20 | -------------------------------------------------------------------------------- /legacy/scripts/securingtomorrow.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "https://www.mcafee.com/blogs" 5 | output_dir = os.path.join('.', 'data', 'securingtomorrow.mcafee.com') 6 | 7 | headers = { 8 | 'Accept':'application/json, text/javascript, */*; q=0.01', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'Connection':'keep-alive', 11 | 'cookie':'utag_main=_st:1540559125612$ses_id:1540558064662%3Bexp-session; PHPSESSID=99p4mht3963nnm4649bhdhcna1', 12 | 'Host': 'www.mcafee.com', 13 | 'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36', 14 | 'X-Requested-With':'XMLHttpRequest' 15 | } 16 | 17 | if __name__ == "__main__": 18 | from legacy import common_crawl 19 | wpc = crawler.WordPressCrawler(url, headers, output_dir) 20 | common_crawl(wpc) 21 | -------------------------------------------------------------------------------- /legacy/scripts/securityaffairs.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "https://securityaffairs.co/wordpress" 5 | output_dir = os.path.join('.', 'data', 'securityaffairs.co-wordpress') 6 | 7 | headers = { 8 | 'Accept':'application/json, text/javascript, */*; q=0.01', 9 | 'Accept-Encoding':'gzip, deflate, sdch, br', 10 | 'Accept-Language':'zh-CN,zh;q=0.8', 11 | 'Connection':'keep-alive', 12 | 'cookie':'__sharethis_cookie_test__=1; __unam=6f69f6a-166b1935d01-3a484fa3-3', 13 | 'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36', 14 | 'X-Requested-With':'XMLHttpRequest' 15 | } 16 | 17 | if __name__ == "__main__": 18 | from legacy import common_crawl 19 | wpc = crawler.WordPressCrawler(url, headers, output_dir) 20 | common_crawl(wpc) 21 | -------------------------------------------------------------------------------- /legacy/scripts/securityboulevard.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import improved_crawler 2 | import os 3 | 4 | url = "https://securityboulevard.com" 5 | output_dir = os.path.join('.', 'data', 'securityboulevard.com') 6 | headers = { 7 | 'Accept':'application/json, text/javascript, */*; q=0.01', 8 | 'Accept-Encoding':'*', 9 | 'Accept-Language':'en-US,en;q=0.9,zh-CN,zh;q=0.8', 10 | 'Cache-control': 'max-age=0', 11 | 'Cookie': 'timer=3; lastvisit=1618499885', 12 | 'Dnt': '1', 13 | 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36', 
14 | } 15 | 16 | if __name__ == "__main__": 17 | from legacy import common_crawl 18 | wpc = improved_crawler.MultiThreadedCrawler(url, headers, output_dir, crawl_rate=20, retry_standoff=30, max_retries=5) 19 | common_crawl(wpc) 20 | -------------------------------------------------------------------------------- /legacy/scripts/securityledger.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "https://securityledger.com" 5 | output_dir = os.path.join('.', 'data', 'securityledger.com') 6 | headers = { 7 | 'Accept':'application/json, text/javascript, */*; q=0.01', 8 | 'Accept-Encoding':'*', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'Cookie': '__cfduid=df69a10a216bf3a67efa57115a197bbda1540799737; __sharethis_cookie_test__=1; __unam=7639673-166bed22de8-519e7a93-1; __qca=P0-1129919385-1540799738659; _ga=GA1.2.963329934.1540799739; _gid=GA1.2.1250730364.1540799739', 11 | 'Connection':'keep-alive', 12 | 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/70.0.3538.67 Chrome/70.0.3538.67 Safari/537.36', 13 | 'X-Requested-With':'XMLHttpRequest' 14 | } 15 | 16 | if __name__ == "__main__": 17 | from legacy import common_crawl 18 | wpc = crawler.WordPressCrawler(url, headers, output_dir) 19 | common_crawl(wpc) 20 | -------------------------------------------------------------------------------- /legacy/scripts/securityweekly.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "https://securityweekly.com" 5 | output_dir = os.path.join('.', 'data', 'securityweekly.com') 6 | headers = { 7 | 'Accept':'application/json, text/javascript, */*; q=0.01', 8 | 'Accept-Encoding':'*', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'Cookie': '_ga=GA1.2.1656740477.1540263278; _gid=GA1.2.1979594887.1540799746', 11 | 'Connection':'keep-alive', 12 | 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/70.0.3538.67 Chrome/70.0.3538.67 Safari/537.36', 13 | 'X-Requested-With':'XMLHttpRequest' 14 | } 15 | 16 | if __name__ == "__main__": 17 | from legacy import common_crawl 18 | wpc = crawler.WordPressCrawler(url, headers, output_dir) 19 | common_crawl(wpc) 20 | -------------------------------------------------------------------------------- /legacy/scripts/sensorstechforum.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "https://sensorstechforum.com" 5 | output_dir = os.path.join('.', 'data', 'sensorstechforum.com') 6 | headers = { 7 | 'Accept':'application/json, text/javascript, */*; q=0.01', 8 | 'Accept-Encoding':'*', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'Cookie': '__cfduid=d6e8d60c5ed6e3694722eda50a248c54a1541577018; PHPSESSID=824147b5f2c890a2dc93bfbaccf9e157; cookiescriptaccept=visit', 11 | 'Connection':'keep-alive', 12 | 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/70.0.3538.67 Chrome/70.0.3538.67 Safari/537.36', 13 | 'X-Requested-With':'XMLHttpRequest' 14 | } 15 | 16 | if __name__ == "__main__": 17 | from legacy import common_crawl 18 | wpc = crawler.WordPressCrawler(url, headers, output_dir, crawl_rate=100) 19 | common_crawl(wpc) 20 | -------------------------------------------------------------------------------- /legacy/scripts/socialengineer.py: 
-------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "https://www.social-engineer.org" 5 | output_dir = os.path.join('.', 'data', 'social-engineer.org') 6 | headers = { 7 | 'Accept':'application/json, text/javascript, */*; q=0.01', 8 | 'Accept-Encoding':'*', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'Cookie': 'PHPSESSID=66j0js30oq2j43o59fa278pti1; wordpress_test_cookie=WP+Cookie+check; __unam=fea26eb-166bed13d3b-2f1cc5de-1; _ga=GA1.2.1491302686.1540799678; _gid=GA1.2.157321464.1540799678', 11 | 'Host': 'www.social-engineer.org', 12 | 'Connection':'keep-alive', 13 | 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/70.0.3538.67 Chrome/70.0.3538.67 Safari/537.36', 14 | 'X-Requested-With':'XMLHttpRequest' 15 | } 16 | 17 | if __name__ == "__main__": 18 | from legacy import common_crawl 19 | wpc = crawler.WordPressCrawler(url, headers, output_dir) 20 | common_crawl(wpc) 21 | -------------------------------------------------------------------------------- /legacy/scripts/sucuri.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "https://blog.sucuri.net" 5 | output_dir = os.path.join('.', 'data', 'blog.sucuri.net') 6 | headers = { 7 | 'Accept':'application/json, text/javascript, */*; q=0.01', 8 | 'Accept-Encoding':'*', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'Cookie': '_ga=GA1.2.1452495584.1541147436; IR_gbd=sucuri.net; _gid=GA1.2.959211258.1541383890; IR_3713=1541559256293%7C0%7C1541559256293; IR_PI=3df49081-c233-9d54-c728-932176615ccd%7C1541645656293', 11 | 'Connection':'keep-alive', 12 | 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/70.0.3538.67 Chrome/70.0.3538.67 Safari/537.36', 13 | 'X-Requested-With':'XMLHttpRequest' 14 | } 15 | 16 | if __name__ == "__main__": 17 | from legacy import common_crawl 18 | wpc = crawler.WordPressCrawler(url, headers, output_dir, crawl_rate=100) 19 | common_crawl(wpc) 20 | -------------------------------------------------------------------------------- /legacy/scripts/techcrunch.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "https://techcrunch.com" 5 | output_dir = os.path.join('.', 'data', 'techcrunch.com') 6 | headers = { 7 | 'Accept':'application/json, text/javascript, */*; q=0.01', 8 | 'Accept-Encoding':'*', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'Cookie': 'GUCS=AZbgFVVR; BX=dh9q691dto2nb&b=3&s=pa; GUC=AQEBAQFb3VFcqEIeDwRg&s=AQAAAHrL6bS1&g=W9wK9Q; rxx=1igbwm2xq2h.1b7gr4g0&v=1; __pcvc={}; _parsely_session={%22sid%22:1%2C%22surl%22:%22https://techcrunch.com/tag/security/%22%2C%22sref%22:%22https://heimdalsecurity.com/blog/best-internet-security-blogs/%22%2C%22sts%22:1541147372572%2C%22slts%22:0}; _ga=GA1.2.1478942211.1541147373; _gid=GA1.2.641246751.1541147373; _fbp=fb.1.1541147372697.1423203867; _parsely_visitor={%22id%22:%22pid=ec7dcbdb884243adc40612940d2620db%22%2C%22session_count%22:1%2C%22last_session_ts%22:1541147372572}; __tbc=%7Bjzx%7DjGAToaZMxJYLoS7N4KRjDSIBQx5XEUgUJ2gvwnLO_mQfpRr8AZeO09jGk7-hTUyQLRad9Q5Lm3MJbSMChjRhiKNoGVtYYENGn5Vzh_MxqnGS8ho0EJnccNxVf1vgcsspOzaeIILFhlSTw5lulLLvTw; __pat=-14400000; 
__pvi=%7B%22id%22%3A%22v-2018-11-02-16-29-33-940-6TjxfdUCVZ3f1YuQ-2f3e0384dd8f3212b5d9c2020699090e%22%2C%22domain%22%3A%22.techcrunch.com%22%2C%22time%22%3A1541147374314%7D; xbc=%7Bjzx%7D_e5BhmMRjtsjUjGbrtX0c7h7R3xM4VwABoRuGqnXs1Ch_5rnoQguNFJyyQn8ud-iL8IqeSU80X8vYQUgLUhfHjuT7rlXqV_haFtcBrb8yuIRISMjwJMKlsrJHHm5uxsQSNVe4coFP2tX0siiAgGZ0F02N2xHWSErhk02CjXpVfgMydpMnNyxptMGZ-xXMB5A; __adblocker=false', 11 | 'Connection':'keep-alive', 12 | 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/70.0.3538.67 Chrome/70.0.3538.67 Safari/537.36', 13 | 'X-Requested-With':'XMLHttpRequest' 14 | } 15 | 16 | if __name__ == "__main__": 17 | from legacy import common_crawl 18 | wpc = crawler.WordPressCrawler(url, headers, output_dir) 19 | common_crawl(wpc) 20 | -------------------------------------------------------------------------------- /legacy/scripts/techlear.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "https://www.techlear.com/blog" 5 | output_dir = os.path.join('.', 'data', 'techlear.com') 6 | headers = { 7 | 'Accept':'application/json, text/javascript, */*; q=0.01', 8 | 'Accept-Encoding':'*', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'Host': 'www.techlear.com', 11 | 'Connection':'keep-alive', 12 | 'Cookie': '_mkto_trk=id:497-ITQ-712&token:_mch-bromium.com-1545360431008-12984; _biz_sid=919162; _ga=GA1.2.1945164141.1545360432; _gid=GA1.2.949409536.1545360432; _biz_uid=a18ee78fd1664b1ecb82da38af7ca704; _biz_flagsA=%7B%22Version%22%3A1%2C%22Mkto%22%3A%221%22%2C%22XDomain%22%3A%221%22%7D; _biz_nA=4; _biz_pendingA=%5B%5D', 13 | 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/71.0.3578.80 Chrome/71.0.3578.80 Safari/537.36', 14 | 'X-Requested-With':'XMLHttpRequest' 15 | } 16 | 17 | if __name__ == "__main__": 18 | from legacy import common_crawl 19 | wpc = crawler.WordPressCrawler(url, headers, output_dir) 20 | common_crawl(wpc) 21 | -------------------------------------------------------------------------------- /legacy/scripts/threatpost.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "https://threatpost.com/" 5 | output_dir = os.path.join('.', 'data', 'threatpost') 6 | headers = { 7 | 'Accept':'application/json, text/javascript, */*; q=0.01', 8 | 'Accept-Encoding':'*', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'Cookie': '_ga=GA1.2.908572152.1540264737; __gads=ID=b803df93913bd567:T=1540264741:S=ALNI_MaUpeM35RC42f9YZMJE4x8eKpPo-Q; _gid=GA1.2.1096157912.1540537307; _gat_UA-35676203-21=1; _fbp=fb.1.1540537307445.696691226; _gat_gtag_UA_109681207_2=1', 11 | 'Host': 'threatpost.com', 12 | 'Connection':'keep-alive', 13 | 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36', 14 | 'X-Requested-With':'XMLHttpRequest' 15 | } 16 | 17 | if __name__ == "__main__": 18 | from legacy import common_crawl 19 | wpc = crawler.WordPressCrawler(url, headers, output_dir) 20 | common_crawl(wpc) 21 | -------------------------------------------------------------------------------- /legacy/scripts/tripwire.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "https://www.tripwire.com/state-of-security" 5 | output_dir = os.path.join('.', 'data', 
'tripwire.com') 6 | headers = { 7 | 'Accept':'application/json, text/javascript, */*; q=0.01', 8 | 'Accept-Encoding':'*', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'Cookie': 'ASP.NET_SessionID=rno4pepppbkdh2f1bn5nc2kj; SC_ANALYTICS_GLOBAL_COOKIE=f9c357b4f7124f7890bd980c59b97db4|False; active=yes; Set_Me=3415655320.1.1933537720.2330721216; SnapABugRef=https%3A%2F%2Fwww.tripwire.com%2F%20; SnapABugHistory=1#; SnapABugVisit=1#1541675065', 11 | 'Connection':'keep-alive', 12 | 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/70.0.3538.67 Chrome/70.0.3538.67 Safari/537.36', 13 | 'X-Requested-With':'XMLHttpRequest' 14 | } 15 | 16 | if __name__ == "__main__": 17 | from legacy import common_crawl 18 | wpc = crawler.WordPressCrawler(url, headers, output_dir, crawl_rate=25) 19 | common_crawl(wpc) 20 | -------------------------------------------------------------------------------- /legacy/scripts/trustedsec.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "https://www.trustedsec.com" 5 | output_dir = os.path.join('.', 'data', 'trustedsec.com') 6 | headers = { 7 | 'Accept':'application/json, text/javascript, */*; q=0.01', 8 | 'Accept-Encoding':'*', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'Host': 'www.trustedsec.com', 11 | 'Connection':'keep-alive', 12 | 'Cookie': '_mkto_trk=id:497-ITQ-712&token:_mch-bromium.com-1545360431008-12984; _biz_sid=919162; _ga=GA1.2.1945164141.1545360432; _gid=GA1.2.949409536.1545360432; _biz_uid=a18ee78fd1664b1ecb82da38af7ca704; _biz_flagsA=%7B%22Version%22%3A1%2C%22Mkto%22%3A%221%22%2C%22XDomain%22%3A%221%22%7D; _biz_nA=4; _biz_pendingA=%5B%5D', 13 | 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/71.0.3578.80 Chrome/71.0.3578.80 Safari/537.36', 14 | 'X-Requested-With':'XMLHttpRequest' 15 | } 16 | 17 | if __name__ == "__main__": 18 | from legacy import common_crawl 19 | wpc = crawler.WordPressCrawler(url, headers, output_dir) 20 | common_crawl(wpc) 21 | -------------------------------------------------------------------------------- /legacy/scripts/unwire.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "https://unwire.pro" 5 | output_dir = os.path.join('.', 'data', 'unwire.pro') 6 | headers = { 7 | 'Accept':'application/json, text/javascript, */*; q=0.01', 8 | 'Accept-Encoding':'*', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'Connection':'keep-alive', 11 | 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/70.0.3538.67 Chrome/70.0.3538.67 Safari/537.36', 12 | } 13 | 14 | if __name__ == "__main__": 15 | from legacy import common_crawl 16 | wpc = crawler.WordPressCrawler(url, headers, output_dir, crawl_rate=5) 17 | common_crawl(wpc) 18 | -------------------------------------------------------------------------------- /legacy/scripts/vipre.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "https://www.vipre.com" 5 | output_dir = os.path.join('.', 'data', 'www.vipre.com') 6 | headers = { 7 | 'Accept':'application/json, text/javascript, */*; q=0.01', 8 | 'Accept-Encoding':'*', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'Cookie': '_ga=GA1.2.1644535100.1541147443; __adroll_fpc=3a119bf44db7ee75d21bdc4bd8006d25; 
__idcontext=eyJjb29raWVJRCI6IjZFM0I3VUNSNDVOWVZHT1o3WTJGSDZOT1dXRUtFMkM0TlhKNUdBVDdIVkdRPT09PSIsImRldmljZUlEIjoiNkUzQjdVQ1I0QjNKRlg2WFZZSVdMNjcyU1M0SllNU0dKTEc1T0FDR0c0RUE9PT09IiwiaXYiOiI1SVdKNlpMUlBKN0lNNlcyVkpWMjZQRzJKST09PT09PSIsInYiOjF9; _gid=GA1.2.1952222110.1541383857; _gcl_au=1.1.535369970.1541532633; __ar_v4=R5EX2LAD7FAOVD6PWNPH6O%3A20181102%3A10%7CN7AGIAEPRZDM5FMAGV2QUY%3A20181102%3A10%7C53FLNYE57ZE4ZDAHOYANNY%3A20181102%3A10', 11 | 'Host': 'www.vipre.com', 12 | 'Connection':'keep-alive', 13 | 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/70.0.3538.67 Chrome/70.0.3538.67 Safari/537.36', 14 | 'X-Requested-With':'XMLHttpRequest' 15 | } 16 | 17 | 18 | if __name__ == "__main__": 19 | from legacy import common_crawl 20 | wpc = crawler.WordPressCrawler(url, headers, output_dir, crawl_rate=100, verify_ssl=False) 21 | common_crawl(wpc) 22 | -------------------------------------------------------------------------------- /legacy/scripts/wired.py: -------------------------------------------------------------------------------- 1 | from legacy.crawler import crawler 2 | import os 3 | 4 | url = "https://www.wired.com" 5 | output_dir = os.path.join('.', 'data', 'wired.com') 6 | headers = { 7 | 'Accept':'application/json, text/javascript, */*; q=0.01', 8 | 'Accept-Encoding':'*', 9 | 'Accept-Language':'zh-CN,zh;q=0.8', 10 | 'Cookie': 'pay_ent_smp=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCIsInZlciI6MX0.eyJ1cmxzIjpbIi8yMDAxLzAzL2hvbmV5cG90cy1iYWl0LWZvci10aGUtY3JhY2tlciJdLCJjbnQiOjEsIm1heCI6NCwiZXhwIjoyMDE4MTB9.SvGXeHRGxLna6rX9Kr9qTXrSU-pSbIkrDE_hQwQ60O4; CN_xid=d75239c8-659c-476c-9f5d-ea0547275e55; _sdsat_landing_page=https://www.wired.com/2001/03/honeypots-bait-for-the-cracker/|1540044766455; _sdsat_session_count=1; _sdsat_traffic_source=https://www.google.com.hk/; visitedCount_jwt=1; AMCVS_F7093025512D2B690A490D44%40AdobeOrg=1; CN_sp=fa58760d-4e72-4ce3-b443-2c9b42243770; CN_su=d19800c0-59da-4191-a88c-5c4148ebb692; CN_segments=; _ga=GA1.2.802886981.1540044768; fpcid=2456436582056134046_FP; v30=google.com.hk; v39=google.com.hk; s_cc=true; __gads=ID=f8c89cd159acd50f:T=1540044768:S=ALNI_MainR0wk-mflQoNeN_UO7dory-7gQ; aamconde=conde%3Dsv%3BCN%3D764985; aam_optimizely=aam%3D226821; aam_uuid=26426712967223226083480932539320192677; _sdsat_lt_pages_viewed=2; _sdsat_pages_viewed=2; _sdsat_AAM_UUID=26426712967223226083480932539320192677; CN_visits_m=1541001600572%26vn%3D2; CN_in_visit_m=true; sID=2955f7c3-91dd-4bc7-abc7-999968ecee3c; pID=1d3e3648-9926-466b-beec-6a2e98c8702c; AMCV_F7093025512D2B690A490D44%40AdobeOrg=1099438348%7CMCIDTS%7C17834%7CMCMID%7C26574736731012813853459705698242364028%7CMCAAMLH-1541404553%7C3%7CMCAAMB-1541404553%7CRKhpRz8krg2tLO6pguXWp5olkAcUniQYPHaMWWgdJ3xzPWQmdj0y%7CMCOPTOUT-1540806953s%7CNONE%7CMCAID%7CNONE%7CvVersion%7C2.1.0; s_vnum_m=1541001600667%26vn%3D2; sinvisit_m=true; s_depth=1; timeSpent=1540799753187; s_ppn=https%3A%2F%2Fwww.wired.com%2Fcategory%2Fsecurity%2Fthreatlevel%2F; s_pct=Index; s_nr=1540799753188-Repeat; sailthru_pageviews=1; bounceClientVisit2825v=N4IgNgDiBcIBYBcEQM4FIDMBBNAmAYnvgO6kB0xAlgE4CmAJmQMYD2AtkUwIYK0DmLagE8iCOHR5haAN1pgiIADQhqMECAC+QA; _polar_tu=*_%22mgtn%22_@2Q_u_@_97f78f97-5c77-4716-b78f-a0dccc974ab0_Q_n_@3Q_s_@2Q_sc_@*_v_@1Q_a_@1+Q_ss_@_%22phcop7_Q_sl_@_%22phcop7_Q_sd_@*+Q_v_@nullQ_vc_@*_e_@0+Q_vs_@_%22phcop7_Q_vl_@_%22phcop7_Q_vd_@*+Q_vu_@_555fdf068442e929ddada46236b2ea5b_Q_vf_@_%22jnu0e179_+; 
_parsely_session={%22sid%22:2%2C%22surl%22:%22https://www.wired.com/category/threatlevel/%22%2C%22sref%22:%22%22%2C%22sts%22:1540799755083%2C%22slts%22:1540044768270}; _parsely_visitor={%22id%22:%22c609b887-dad6-414c-906e-f6a107dbb880%22%2C%22session_count%22:2%2C%22last_session_ts%22:1540799755083}; sailthru_content=e43720c11f5345e88d86bc1d5be31f74e2553d06f8d9ea3b9cb7420abe100f46; sailthru_visitor=1cb98baf-6809-4646-a0f8-aa82685e000a; AMP_TOKEN=%24NOT_FOUND; _gid=GA1.2.368334264.1540799760', 11 | 'Connection':'keep-alive', 12 | 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/70.0.3538.67 Chrome/70.0.3538.67 Safari/537.36', 13 | 'X-Requested-With':'XMLHttpRequest' 14 | } 15 | 16 | 17 | if __name__ == "__main__": 18 | from legacy import common_crawl 19 | wpc = crawler.WordPressCrawler(url, headers, output_dir) 20 | common_crawl(wpc) 21 | --------------------------------------------------------------------------------
/legacy_crawl_all.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import os 3 | from multiprocessing import Pool 4 | import argparse 5 | 6 | from legacy_main import crawl 7 | 8 | threads = 10 9 | crawl_rate = 5 10 | 11 | 12 | def multiple_crawl(jobs): 13 | # each CSV row must provide 'url' and 'output_dir' columns 14 | flattened_jobs = map(lambda x: (x['url'], x['output_dir'], crawl_rate), jobs) 15 | with Pool(threads) as p: 16 | p.starmap(crawl, flattened_jobs) 17 | 18 | 19 | def parse(csv_path: str): 20 | if os.path.exists(csv_path): 21 | with open(csv_path, 'r') as f: 22 | csvreader = csv.DictReader(f, delimiter=',', quotechar='"') 23 | return list(csvreader) 24 | else: 25 | raise FileNotFoundError(csv_path) 26 | 27 | 28 | def main(csv_file): 29 | jobs = parse(csv_file) 30 | multiple_crawl(jobs) 31 | 32 | 33 | if __name__ == "__main__": 34 | parser = argparse.ArgumentParser() 35 | parser.add_argument('--csvfile', default="./legacy/scripts/crawled_list.csv") 36 | args = parser.parse_args() 37 | main(args.csvfile) 38 | --------------------------------------------------------------------------------
/legacy_main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from legacy.crawler import crawler 3 | from legacy import common_header, common_crawl 4 | 5 | 6 | def crawl(url, output_dir, crawl_rate=25): 7 | wpc = crawler.WordPressCrawler(url, common_header, output_dir, crawl_rate) 8 | common_crawl(wpc) 9 | 10 | 11 | if __name__ == "__main__": 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument("url") 14 | parser.add_argument("output_dir") 15 | parser.add_argument("--crawl_rate", type=int, default=25) 16 | args = parser.parse_args() 17 | crawl(args.url, args.output_dir, args.crawl_rate) 18 | --------------------------------------------------------------------------------
/requirements.txt: -------------------------------------------------------------------------------- 1 | pymongo==4.7.1 2 | requests==2.31.0 --------------------------------------------------------------------------------
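The two legacy entry points above work together: `legacy_crawl_all.py` fans per-site jobs out over a process pool, and each job runs through `legacy_main.py`'s `crawl()`. The column layout of `crawled_list.csv` is not shown here, but judging from the tuple built in `multiple_crawl`, every row needs at least a `url` and an `output_dir` field. A minimal, hypothetical file might look like this:

```csv
url,output_dir
https://example-blog.com,./data/example-blog.com
https://another-blog.org,./data/another-blog.org
```

Any single row can also be crawled directly:

```bash
python3 legacy_main.py https://example-blog.com ./data/example-blog.com --crawl_rate 25
```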
/wpscraper/connector.py: -------------------------------------------------------------------------------- 1 | import os 2 | from abc import ABC, abstractmethod 3 | import hashlib 4 | import json 5 | 6 | from pymongo import MongoClient 7 | 8 | from wpscraper.document import Document 9 | 10 | 11 | def create_directory(directory): 12 | # os.makedirs already creates any missing parent directories, so no manual recursion is needed 13 | os.makedirs(directory, exist_ok=True) 14 | 15 | 16 | class Connector(ABC): 17 | @abstractmethod 18 | def process_document(self, document: Document, *args, **kwargs): 19 | pass 20 | 21 | 22 | class FileSystemConnector(Connector): 23 | def __init__(self, folder: str, save_as_individual_files: bool = False): 24 | self.folder = folder 25 | self.save_as_individual_files = save_as_individual_files 26 | create_directory(self.folder) 27 | 28 | def process_document(self, document: Document, resource: str): 29 | json_string = json.dumps(document.data) 30 | if self.save_as_individual_files: 31 | self._generate_individual_document(json_string) 32 | else: 33 | self._append_resource_document(resource, json_string) 34 | 35 | def _generate_individual_document(self, json_string: str): 36 | # name the file after the SHA-256 of its content, so identical documents collapse into one file 37 | filename = hashlib.sha256(json_string.encode('utf-8')).hexdigest() 38 | file_to_write = os.path.join(self.folder, "{}.json".format(filename)) 39 | with open(file_to_write, 'w') as f: 40 | f.write(json_string) 41 | 42 | def _append_resource_document(self, resource: str, json_string: str): 43 | # append to a single JSON-lines file per resource type, e.g. posts.json 44 | file_to_write = os.path.join(self.folder, "{}.json".format(resource)) 45 | with open(file_to_write, 'a') as f: 46 | f.write(json_string) 47 | f.write("\n") # add linebreak at the end 48 | 49 | 50 | class MongoDBConnector(Connector): 51 | def __init__(self, db_host: str, db_port: int, db_database: str, db_collection: str, 52 | username: str, password: str, auth_source: str = "admin", auth_mechanism: str = "SCRAM-SHA-256"): 53 | self.db_host = db_host 54 | self.db_port = db_port 55 | self.db_database = db_database 56 | self.db_collection = db_collection 57 | self.username = username 58 | self.password = password 59 | self.auth_source = auth_source 60 | self.auth_mechanism = auth_mechanism 61 | self.client = MongoClient(host=self.db_host, port=self.db_port, username=self.username, password=self.password, 62 | authSource=self.auth_source, authMechanism=self.auth_mechanism) 63 | 64 | def process_document(self, document: Document, resource: str): 65 | # resource is unused here; the parameter is kept so every connector shares the same call signature 66 | doc_id = self.client[self.db_database][self.db_collection].insert_one(document=document.data).inserted_id 67 | if not doc_id: 68 | raise ConnectionError("Couldn't insert document into MongoDB.") 69 | --------------------------------------------------------------------------------
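A note on the two `FileSystemConnector` modes: per-document files are content-addressed by SHA-256, so re-crawled duplicates collapse into a single file, while the default append mode produces one JSON-lines file per resource. A minimal sketch of driving a connector outside a session (paths and payload are illustrative):

```python
from wpscraper.connector import FileSystemConnector
from wpscraper.document import RawDocument

doc = RawDocument(raw_data={"id": 1, "title": "hello"})

# individual mode: one content-addressed file per document
FileSystemConnector("./data/demo", save_as_individual_files=True).process_document(doc, "posts")

# append mode (default): everything lands in ./data/demo/posts.json, one object per line
FileSystemConnector("./data/demo").process_document(doc, "posts")
```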
/wpscraper/crawler.py: -------------------------------------------------------------------------------- 1 | import json 2 | import time 3 | from abc import ABC, abstractmethod 4 | 5 | import requests 6 | 7 | from wpscraper.headers import Headers 8 | 9 | 10 | class Crawler(ABC): 11 | def __init__(self, url, headers: Headers, verify_ssl: bool, timeout: int, max_retries: int): 12 | # strip any trailing slash so the API path never contains a double slash 13 | self.api_path = url.rstrip('/') + '/wp-json/wp/v2/' 14 | self.headers = headers 15 | self.verify_ssl = verify_ssl 16 | self.timeout = timeout 17 | self.max_retries = max_retries 18 | 19 | @abstractmethod 20 | def crawl(self, resource: str, *args, **kwargs): 21 | pass 22 | 23 | 24 | class SimpleRequestsCrawler(Crawler): 25 | def __init__(self, url, headers: Headers = None, crawl_rate: int = 25, verify_ssl: bool = True, timeout: int = 30, 26 | max_retries: int = 5, constant_retry_standoff: int = 30): 27 | super().__init__(url=url, headers=headers, verify_ssl=verify_ssl, timeout=timeout, max_retries=max_retries) 28 | self.crawled_resource_count = {} # next page number to request, per resource 29 | self.crawl_rate = crawl_rate 30 | self.constant_retry_standoff = constant_retry_standoff 31 | 32 | def crawl(self, resource: str): 33 | documents = [] 34 | if resource not in self.crawled_resource_count: 35 | self.crawled_resource_count[resource] = 1 36 | objs = self._get_json_response( 37 | self.api_path + resource + '?per_page={}&page={}'.format(self.crawl_rate, self.crawled_resource_count[resource]) 38 | ) 39 | # a successful page is a JSON list; an error object (e.g. page number out of range) is a dict, which ends the crawl 40 | if objs and isinstance(objs, list): 41 | documents = objs 42 | self.crawled_resource_count[resource] += 1 43 | else: 44 | print("No documents are crawled.") 45 | return documents 46 | 47 | def _get_json_response(self, url): 48 | attempt_count = 1 49 | while attempt_count <= self.max_retries: # "<=", so max_retries really means that many attempts 50 | try: 51 | response = requests.get(url, headers=self.headers.headers, timeout=self.timeout, verify=self.verify_ssl) 52 | print('subpath: {}'.format(url)) 53 | print('response code: {}'.format(response.status_code)) 54 | print('response head: {}'.format(response.text[:300])) 55 | # WordPress answers an out-of-range page number with HTTP 400 and a JSON error body, so 400 responses are parsed as well 56 | if response.status_code in (200, 400): 57 | # parse only the first line of the body; the API is assumed to return single-line JSON 58 | return json.loads(next(response.iter_lines())) 59 | else: 60 | print("status code returned {}".format(response.status_code)) 61 | except requests.Timeout: 62 | print("Timed out.") 63 | except Exception as e: 64 | print("Exception occurred: {}".format(e)) 65 | attempt_count += 1 66 | if attempt_count > self.max_retries: 67 | break # don't sleep after the final attempt 68 | print("waiting for {} seconds...".format(self.constant_retry_standoff)) 69 | time.sleep(self.constant_retry_standoff) 70 | print("retrying... (attempt {} of {})".format(attempt_count, self.max_retries)) 71 | print("max no. of retries reached. exiting...") 72 | return None 73 | 74 | def get_crawled_stat(self): 75 | return self.crawled_resource_count 76 | --------------------------------------------------------------------------------
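`SimpleRequestsCrawler` keeps a per-resource page counter, so each `crawl()` call fetches the next page until an empty list signals the end. The same paging loop that `CrawlSession.execute()` in `session.py` below runs can be driven by hand; a minimal sketch against a placeholder site:

```python
from wpscraper.crawler import SimpleRequestsCrawler
from wpscraper.headers import DefaultHeaders

# placeholder target; any site with an open wp-json API should work
crawler = SimpleRequestsCrawler(url="https://your.website.here",
                                headers=DefaultHeaders("your.website.here"))
while True:
    page = crawler.crawl(resource="posts")
    if not page:
        break
    print("fetched {} posts".format(len(page)))
print(crawler.get_crawled_stat())  # per-resource page counters
```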
/wpscraper/document.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | from abc import ABC, abstractmethod 3 | 4 | 5 | class Document(ABC): 6 | def __init__(self, raw_data: Any): 7 | self.raw_data = raw_data 8 | self.data = None 9 | 10 | @abstractmethod 11 | def process_raw_data(self, *args, **kwargs): 12 | pass 13 | 14 | def __repr__(self): 15 | return str(self.data) # __repr__ must return a string; self.data is usually a dict 16 | 17 | 18 | class JSONDocument(Document): 19 | # receive and process raw JSON document from WP-JSON API 20 | def __init__(self, raw_data: dict, **kwargs): 21 | super().__init__(raw_data) 22 | self.process_raw_data(kwargs) 23 | 24 | def process_raw_data(self, kwargs: dict): 25 | # wrap the payload under "data" and merge metadata (resource_type, session_id, ...) alongside it 26 | self.data = {"data": self.raw_data} 27 | self.data.update(kwargs) 28 | 29 | 30 | class RawDocument(Document): 31 | def __init__(self, raw_data: dict): 32 | super().__init__(raw_data) 33 | self.process_raw_data() 34 | 35 | def process_raw_data(self): 36 | self.data = self.raw_data 37 | --------------------------------------------------------------------------------
/wpscraper/headers.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | 4 | class Headers(ABC): 5 | @abstractmethod 6 | def __init__(self): 7 | self.headers = None 8 | 9 | def __repr__(self): 10 | return str(self.headers) 11 | 12 | 13 | class DefaultHeaders(Headers): 14 | def __init__(self, domain): 15 | super().__init__() 16 | self.headers = { 17 | 'Accept': 'application/json, text/javascript, */*; q=0.01', 18 | 'Accept-Encoding': '*', 19 | 'Accept-Language': 'zh-CN,zh;q=0.8', 20 | 'Host': domain, 21 | 'Connection': 'keep-alive', 22 | 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/71.0.3578.80 Chrome/71.0.3578.80 Safari/537.36', 23 | 'X-Requested-With': 'XMLHttpRequest' 24 | } --------------------------------------------------------------------------------
/wpscraper/session.py: -------------------------------------------------------------------------------- 1 | from uuid import uuid4 2 | from typing import List 3 | import datetime 4 | 5 | from urllib.parse import urlparse 6 | 7 | from wpscraper.connector import Connector, FileSystemConnector 8 | from wpscraper.crawler import Crawler, SimpleRequestsCrawler 9 | from wpscraper.headers import DefaultHeaders 10 | from wpscraper.document import JSONDocument 11 | 12 | VALID_PATHS = [ 13 | 'posts', 14 | 'tags', 15 | 'categories' 16 | ] 17 | 18 | 19 | def validate_paths(resources: List[str]): 20 | for path in resources: 21 | if path not in VALID_PATHS: 22 | raise NameError('path "{}" is not a valid path.'.format(path)) 23 | return resources 24 | 25 | 26 | class CrawlSession: 27 | def __init__(self, url: str, resources: List[str], session_id: str = None): 28 | # generate the id per instance; a str(uuid4()) default argument is evaluated once at import time and would be shared by every session 29 | self.session_id = session_id or str(uuid4()) 30 | self.url = url 31 | self.domain = urlparse(self.url).netloc 32 | self.resources = validate_paths(resources) 33 | self.crawler = None 34 | self.connectors = [] 35 | 36 | def set_crawler(self, crawler: Crawler): 37 | self.crawler = crawler 38 | 39 | def add_connector(self, connector: Connector): 40 | self.connectors.append(connector) 41 | 42 | def set_connectors(self, connectors: List[Connector]): 43 | self.connectors = connectors 44 | 45 | def execute(self): 46 | if not (self.crawler and self.connectors): 47 | raise AssertionError("No crawler and/or connector is specified.") 48 | for resource in self.resources: 49 | while True: 50 | raw_documents = self.crawler.crawl(resource=resource) 51 | if not raw_documents: # empty page: this resource is exhausted 52 | break 53 | current_timestamp = datetime.datetime.utcnow().isoformat() 54 | documents = [JSONDocument(document, resource_type=resource, session_id=self.session_id, 55 | crawledtime=current_timestamp) for document in raw_documents] 56 | for document in documents: 57 | for connector in self.connectors: 58 | connector.process_document(resource=resource, document=document) 59 | 60 | 61 | class DefaultCrawlSession(CrawlSession): 62 | def __init__(self, url: str, session_id: str = None): 63 | resources = ['posts', 'tags', 'categories'] 64 | super().__init__(url, resources=resources, session_id=session_id) 65 | headers = DefaultHeaders(self.domain) 66 | self.crawler = SimpleRequestsCrawler(url=self.url, headers=headers) 67 | self.connectors = [FileSystemConnector(folder='./data/{}'.format(self.domain), save_as_individual_files=True)] 68 | --------------------------------------------------------------------------------
/wpscraper/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | # unused function to remove characters before JSON response, might be useful later 5 | def remove_leading_scripts(response_text): 6 | cleaned_text = re.search(r'(\[.*)', response_text) 7 | return cleaned_text.group(1) 8 | --------------------------------------------------------------------------------
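Putting the pieces together, a custom session swaps in a different `Crawler` or `Connector` and calls `execute()`. A sketch that writes to MongoDB instead of the filesystem (host, credentials, and target URL are hypothetical):

```python
from wpscraper.session import CrawlSession
from wpscraper.crawler import SimpleRequestsCrawler
from wpscraper.headers import DefaultHeaders
from wpscraper.connector import MongoDBConnector

url = "https://your.website.here"  # placeholder
session = CrawlSession(url, resources=['posts', 'categories'])
session.set_crawler(SimpleRequestsCrawler(url=url, headers=DefaultHeaders(session.domain)))
session.add_connector(MongoDBConnector(db_host="localhost", db_port=27017,
                                       db_database="wpscraper", db_collection="documents",
                                       username="scraper", password="secret"))
session.execute()
```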