├── .gitignore
├── .gitattributes
├── greatfire-importer.sh
├── find_redundant.py
├── updater.py
├── dnsmasq-conf-maker.sh
└── greatfire-fetcher.py

/.gitignore:
--------------------------------------------------------------------------------
# generated files
*.conf
greatfire.*
greatfire-domains*
# temp files
*~
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
# Set the default behavior, in case people don't have core.autocrlf set.
* text eol=lf
--------------------------------------------------------------------------------
/greatfire-importer.sh:
--------------------------------------------------------------------------------
#!/bin/sh
set -x

Output="./greatfire-domains.txt"

# Import freshly fetched domains, then drop entries already covered by a parent domain.
[ -f "$Output" ] && {
    cat "$Output" | xargs ./updater.py -a
    python3 ./find_redundant.py | xargs ./updater.py -d
}
--------------------------------------------------------------------------------
/find_redundant.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

'''Find redundant entries in domains.txt.

An entry is redundant when one of its parent domains is already listed,
e.g. "www.example.com" is redundant if "example.com" is present.
'''

LEAF = 1


def main():
    with open('domains.txt', 'r') as f:
        lines = f.readlines()

    # Parse the list & prepare the data structure
    data = {}
    for line in lines:
        domain = line.strip()
        if domain == '' or domain.startswith('#'):
            continue
        labels = domain.split('.')
        labels.reverse()
        data[domain] = labels
    # Handle shorter domains first so parents are inserted before their children
    domains = list(data.keys())
    domains.sort(key=lambda k: len(data[k]))

    tree = {}
    for domain in domains:
        labels = data[domain]
        node = tree  # Init current node with root node
        for i, label in enumerate(labels):
            isLastLabel = i + 1 == len(labels)
            # Check whether redundant
            if (node == LEAF) or (isLastLabel and label in node):
                print(domain)
                break
            # Create leaf node
            if isLastLabel:
                node[label] = LEAF
                break
            # Create branch node
            if label not in node:
                node[label] = {}
            # Iterate to child node
            node = node[label]


if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
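find_redundant.py builds a trie keyed by reversed domain labels: shorter domains are
inserted first, and any longer domain whose walk reaches a LEAF node already has a
listed ancestor. A minimal runnable sketch of that idea, using a hypothetical
four-entry list rather than the real domains.txt:

# Minimal sketch of the reversed-label trie check (hypothetical input).
LEAF = 1
tree = {}
for domain in sorted(['www.google.com', 'google.com', 'mail.google.com', 'example.org'],
                     key=lambda d: d.count('.')):
    node = tree
    labels = domain.split('.')[::-1]
    for i, label in enumerate(labels):
        if node == LEAF or (i + 1 == len(labels) and label in node):
            print(domain)  # redundant: an ancestor (or duplicate) is already listed
            break
        node = node.setdefault(label, LEAF if i + 1 == len(labels) else {})
# prints mail.google.com and www.google.com; google.com and example.org are kept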
already exists: " + domain) 44 | else: 45 | print("New domain added: " + domain) 46 | lines.append(new_line) 47 | 48 | if options.delete: 49 | options.sort = True 50 | 51 | for domain in options.delete: 52 | target_line = "%s\n" % domain 53 | if target_line not in lines: 54 | print("Failed to remove domain " + domain + ": not found.") 55 | else: 56 | print("Domain removed: " + domain) 57 | lines.remove(target_line) 58 | 59 | if options.sort: 60 | lines.sort(key=lambda x: x.lstrip("#")) 61 | 62 | with open(options.file[0], "w") as f: 63 | f.write(''.join(filter(lambda line: line.strip(), lines))) 64 | -------------------------------------------------------------------------------- /dnsmasq-conf-maker.sh: -------------------------------------------------------------------------------- 1 | ConfFile="./dnsmasq-blocklist.conf" 2 | DefaultDomainList="./domains.txt" 3 | DomainList="$1" 4 | 5 | [ -f "$DefaultDomainList" ] && DomainListExists=1 || DomainListExists=0 6 | 7 | [ "$1" == "" -a "$DomainListExists" == "0" ] && { 8 | echo "Notice: must cleanup old files before!" 9 | echo "the Default Domain List is $DefaultDomainList" 10 | echo "usage: $0 DOMAIN_LIST_FILE" 11 | exit 1 12 | } 13 | 14 | [ "$1" == "" -a "$DomainListExists" == "1" ] && { 15 | echo "using DefaultDomainList: $DefaultDomainList" 16 | DomainList=${DefaultDomainList} 17 | } 18 | 19 | rm -f $ConfFile 2>&1 20 | 21 | echo -n "Only need ipset list? [y/n]:" 22 | read NoServer 23 | [ "$NoServer" == "y" -o "$NoServer" == "Y" ] || { 24 | echo -n "Enter your dns server (syntax IP[#port] ):" 25 | read dnsserver 26 | [ "$dnsserver" == "" ] && { 27 | dnsserver="8.8.8.8" 28 | echo "dns server empty, using 8.8.8.8 instead" 29 | } 30 | # in case using ':' 31 | dnsserver=${dnsserver//:/#} 32 | } 33 | echo -n "Enter your ipset name:" 34 | read ipset 35 | [ "$ipset" == "" ] && { 36 | ipset="UNKNOWN_IPSET_NAME" 37 | echo "ipset name empty!!" 
/dnsmasq-conf-maker.sh:
--------------------------------------------------------------------------------
#!/bin/bash

ConfFile="./dnsmasq-blocklist.conf"
DefaultDomainList="./domains.txt"
DomainList="$1"

[ -f "$DefaultDomainList" ] && DomainListExists=1 || DomainListExists=0

[ "$1" == "" -a "$DomainListExists" == "0" ] && {
    echo "Notice: clean up old generated files first!"
    echo "The default domain list is $DefaultDomainList"
    echo "usage: $0 DOMAIN_LIST_FILE"
    exit 1
}

[ "$1" == "" -a "$DomainListExists" == "1" ] && {
    echo "using DefaultDomainList: $DefaultDomainList"
    DomainList=${DefaultDomainList}
}

rm -f "$ConfFile"

echo -n "Only need ipset list? [y/n]:"
read NoServer
[ "$NoServer" == "y" -o "$NoServer" == "Y" ] || {
    echo -n "Enter your dns server (syntax IP[#port] ):"
    read dnsserver
    [ "$dnsserver" == "" ] && {
        dnsserver="8.8.8.8"
        echo "dns server empty, using 8.8.8.8 instead"
    }
    # in case ':' was used as the port separator
    dnsserver=${dnsserver//:/#}
}
echo -n "Enter your ipset name:"
read ipset
[ "$ipset" == "" ] && {
    ipset="UNKNOWN_IPSET_NAME"
    echo "ipset name empty, using $ipset"
}

cat "$DomainList" | while read -r SingleDomain
do
    echo "ipset=/$SingleDomain/${ipset}" >> "$ConfFile"
    [ "$NoServer" == "y" -o "$NoServer" == "Y" ] || {
        echo "server=/$SingleDomain/${dnsserver}" >> "$ConfFile"
    }
done

## Append special ipset entries to the conf file; we no longer have to exclude dl.google.com.
# These entries do not add the domains to any ipset; please refer to the dnsmasq manpage.
echo "#### Bypass ipset domains" >> "$ConfFile"
#echo "ipset=/dl.google.com/#">>$ConfFile
cat <<'EOF' | while read -r BypassIpsetDomain
265.com
2mdn.net
app-measurement.com
beacons.gcp.gvt2.com
beacons.gvt2.com
beacons3.gvt2.com
c.admob.com
c.android.clients.google.com
cache.pack.google.com
clientservices.googleapis.com
connectivitycheck.gstatic.com
csi.gstatic.com
dl.google.com
doubleclick.net
e.admob.com
fonts.googleapis.com
fonts.gstatic.com
google-analytics.com
googleadservices.com
googleanalytics.com
googlesyndication.com
googletagmanager.com
googletagservices.com
imasdk.googleapis.com
kh.google.com
khm.google.com
khm.googleapis.com
khm0.google.com
khm0.googleapis.com
khm1.google.com
khm1.googleapis.com
khm2.google.com
khm2.googleapis.com
khm3.google.com
khm3.googleapis.com
khmdb.google.com
khmdb.googleapis.com
media.admob.com
mediavisor.doubleclick.com
redirector.gvt1.com
toolbarqueries.google.com
update.googleapis.com
EOF
do
    echo "ipset=/$BypassIpsetDomain/#" >> "$ConfFile"
done
--------------------------------------------------------------------------------
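For reference, the sketch below reproduces in Python roughly what dnsmasq-conf-maker.sh
writes; the domains, ipset name and server address here are hypothetical examples, not
repository defaults:

# Sketch of the dnsmasq directives the script emits (hypothetical values).
domains = ['example.com', 'blocked.example.org']
ipset_name = 'gfwlist'        # hypothetical ipset name entered at the prompt
dns_server = '8.8.8.8#53'     # IP[#port]; a ':' separator would be rewritten to '#'
with open('dnsmasq-blocklist.conf', 'w') as conf:
    for d in domains:
        conf.write(f'ipset=/{d}/{ipset_name}\n')   # add resolved addresses to the ipset
        conf.write(f'server=/{d}/{dns_server}\n')  # forward queries for d to this server
    # bypass entries: per the note in the script, a trailing '#' keeps the domain
    # out of any ipset
    conf.write('ipset=/dl.google.com/#\n')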
/greatfire-fetcher.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# Domain list fetcher for the GreatFire Analyzer; use at your own risk
# and modify the parameters below if needed.

# Run this script only with clean (unpoisoned) DNS results and over a working tunnel.

# Copyright (c) 2019-2021 wongsyrone

import sys
import traceback
import re
import bs4
import requests

# https://zh.greatfire.org/analyzer
# Examples:
# https://zh.greatfire.org/search/alexa-top-1000-domains?page=8
# https://zh.greatfire.org/search/domains?page=1091

# ===============================================================
# PARAMETERS
# ===============================================================

# Whether we should use a proxy.
# Configure the addresses below if set to 'True'.
UseProxy = False

# Some default values; they can be overridden per call.

# What counts as a "big" page count is determined by DefaultMaxPageCount.
DefaultGetAllPagesEvenIfWeHaveBigPageCount = False
DefaultMaxPageCount = 100
DefaultBlockThreshold = 80

DomainListFileName = 'greatfire-domains.txt'

if UseProxy:
    # Install the optional dependency if the SOCKS protocol is needed.
    myProxies = {
        'http': 'http://127.0.0.1:1088',
        'https': 'http://127.0.0.1:1088',
    }
else:
    myProxies = None

# ===============================================================
# Do NOT change the content below unless you know what it means
# ===============================================================
AlexaTop1000URL = 'https://zh.greatfire.org/search/alexa-top-1000-domains'
DomainsURL = 'https://zh.greatfire.org/search/domains'
BlockedURL = 'https://zh.greatfire.org/search/blocked'

domainPattern = re.compile(
    r'^(?:(([a-zA-Z]{1})|([a-zA-Z]{1}[a-zA-Z]{1})|'  # domain pt.1
    r'([a-zA-Z]{1}[0-9]{1})|([0-9]{1}[a-zA-Z]{1})|'  # domain pt.2
    r'([a-zA-Z0-9][-_a-zA-Z0-9]{0,61}[a-zA-Z0-9]))\.)+'  # domain pt.3
    r'([a-zA-Z]{2,13}|(xn--[a-zA-Z0-9]{2,30}))$'  # TLD
)

# For the page count: grab the digits that follow "page=" in the pager href.
hrefPageParamPattern = re.compile(r'(?<=page=)\d+')

domainDict = {}
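
# Note on the Analyzer markup the parser below relies on (observed layout; it may
# change without notice): each result row puts the domain name in a
# <td class="first"> cell, and the block-percentage cell comes right after it.
# next_sibling is applied twice because BeautifulSoup usually reports the
# whitespace text node between adjacent <td> elements as the first sibling.
# Percentage cells whose class list contains "blocked" hold values such as
# "100%", so the trailing '%' is stripped before converting to int.
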
def populate_domain_blockPercent(soup_instance):
    """
    Fill the blocked-domain dictionary from one result page.
    :param soup_instance: BeautifulSoup instance of the page
    :return: None
    """
    if soup_instance is None:
        raise TypeError
    entries = soup_instance.select('td[class="first"]')
    # table layout:
    # domain-name  date  block-percent  tags
    for entry in entries:
        domainName = entry.text.strip()
        blockPercentElem = entry.next_sibling.next_sibling
        if "blocked" in blockPercentElem["class"]:
            # remove the '%' sign
            blockPercent = int(blockPercentElem.text.strip()[:-1])
            domainDict[domainName] = blockPercent


def get_first_page_and_count(url, proxies):
    """
    Get the first page's raw HTML and the page count.
    :param url:
    :param proxies:
    :return: {rawHTML, pageCount, soupIns} or None if an error occurred
    """
    try:
        print(f'handling {url} and getting the page count')
        req = requests.get(url=url, proxies=proxies)
        if req.status_code != requests.codes.ok:
            return None
        pageRawHTML = req.text
        mysoup = bs4.BeautifulSoup(pageRawHTML, features='lxml')
        entry = mysoup.select('li[class="pager-last last"] a')
        href = entry[0].attrs["href"]
        total_page_count = int(hrefPageParamPattern.findall(href)[0])
        print(f'{url} reports last page index {total_page_count}; add one for the actual page count')
        return {'rawHTML': pageRawHTML, 'pageCount': total_page_count, 'soupIns': mysoup}
    except Exception:
        traceback.print_exc(file=sys.stdout)
        return None


def get_page_content(url, pageIndex, proxies):
    """
    Get a whole page via its page index.
    :param url:
    :param pageIndex:
    :param proxies:
    :return: BeautifulSoup instance or None if an error occurred
    """
    try:
        print(f'handling {url} page {pageIndex}')
        req = requests.get(url=url, params={"page": pageIndex}, proxies=proxies)
        if req.status_code != requests.codes.ok:
            return None
        return bs4.BeautifulSoup(req.text, features='lxml')
    except Exception:
        traceback.print_exc(file=sys.stdout)
        return None


def do_url(url, proxies, getAllPagesEvenIfWeHaveBigPageCount=DefaultGetAllPagesEvenIfWeHaveBigPageCount,
           maxPageCount=DefaultMaxPageCount):
    """
    Fetch, parse and populate the blocked-domain dictionary for one listing URL.
    :param url:
    :param proxies:
    :param getAllPagesEvenIfWeHaveBigPageCount: fetch every page even if there are many
    :param maxPageCount: page limit applied when not fetching everything
    :return: None
    """
    tmpDict = get_first_page_and_count(url, proxies)
    if tmpDict is None:
        print("failed to get the first page")
        raise ConnectionError
    last_page_num = tmpDict['pageCount']
    total_page_count = last_page_num + 1
    # the actual page one (it doesn't need the ?page= parameter)
    populate_domain_blockPercent(tmpDict['soupIns'])
    # get the page range starting from the actual page two
    if not getAllPagesEvenIfWeHaveBigPageCount and total_page_count > maxPageCount + 1:
        pageRange = range(1, maxPageCount + 1)
    else:
        # only when we are told to fetch all pages, or there are only a few to fetch
        pageRange = range(1, total_page_count)
    for i in pageRange:
        soup = get_page_content(url, i, proxies)
        if soup is None:
            print(f"failed to get {url} page {i}")
            raise ConnectionError
        populate_domain_blockPercent(soup)


def is_valid_domain(domainStr):
    try:
        return domainPattern.fullmatch(domainStr) is not None
    except Exception:
        return False


def write_file(content):
    with open(DomainListFileName, mode='w', encoding='utf-8', newline='\n') as fd:
        fd.write(content)
        fd.flush()


# ===============================================================
# PROCEDURES
# ===============================================================

if __name__ == '__main__':
    try:
        # download and populate the mapping dictionary
        do_url(AlexaTop1000URL, myProxies, getAllPagesEvenIfWeHaveBigPageCount=True)
        do_url(DomainsURL, myProxies)
        do_url(BlockedURL, myProxies, getAllPagesEvenIfWeHaveBigPageCount=True)

        # apply the block-percentage threshold
        filteredDict = {domainName: blockPercent for domainName, blockPercent in domainDict.items() if
                        blockPercent >= DefaultBlockThreshold}

        resultList = list(filteredDict.keys())

        # drop invalid domains
        validDomainResultList = [item for item in resultList if is_valid_domain(item)]

        # lower-case everything
        lowerValidDomainResultList = [item.lower() for item in validDomainResultList]

        # write the sorted file
        write_file('\n'.join(sorted(lowerValidDomainResultList)))
    except Exception:
        traceback.print_exc(file=sys.stdout)
--------------------------------------------------------------------------------
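As a quick sanity check of the filtering stage, a minimal sketch assuming that
domainPattern, is_valid_domain and DefaultBlockThreshold from the script above are
available in the same interpreter session (the hyphenated file name prevents a plain
import, so paste or exec the definitions first); the sample data is hypothetical:

# Hypothetical spot-check of the validator plus the threshold filter (threshold is 80).
samples = {'example.com': 95, '265.com': 100, 'localhost': 99, 'bad domain.com': 90}
kept = sorted(d.lower() for d, pct in samples.items()
              if pct >= DefaultBlockThreshold and is_valid_domain(d))
print(kept)  # expected: ['265.com', 'example.com']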