├── .gitignore ├── .gitattributes ├── greatfire-importer.sh ├── find_redundant.py ├── updater.py ├── dnsmasq-conf-maker.sh └── greatfire-fetcher.py /.gitignore: -------------------------------------------------------------------------------- 1 | # generated file 2 | *.conf 3 | greatfire.* 4 | greatfire-domains* 5 | # temp files 6 | *~ 7 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Set the default behavior, in case people don't have core.autocrlf set. 2 | * text eol=lf 3 | -------------------------------------------------------------------------------- /greatfire-importer.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | Output="./greatfire-domains.txt" 3 | 4 | [ -f $Output ] && { 5 | cat $Output | xargs ./updater.py -a 6 | python3 ./find_redundant.py | xargs ./updater.py -d 7 | } 8 | 9 | -------------------------------------------------------------------------------- /find_redundant.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | ''' Find domains.txt for redundant item. 4 | ''' 5 | 6 | LEAF = 1 7 | 8 | def main(): 9 | with open('domains.txt', 'r') as f: 10 | lines = f.readlines() 11 | 12 | # Parse conf file & prepare data structure 13 | data = {} 14 | for line in lines: 15 | if line == '' or line.startswith('#'): 16 | continue 17 | domain = line 18 | labels = domain.split('.') 19 | labels.reverse() 20 | data[domain] = labels 21 | domains = list(data.keys()) 22 | domains.sort(key=lambda k: len(data[k])) 23 | 24 | tree = {} 25 | for domain in domains: 26 | labels = data[domain] 27 | node = tree # Init current node with root node 28 | for i, label in enumerate(labels): 29 | isLastLabel = i + 1 == len(labels) 30 | # Check whether redundant 31 | if (node == LEAF) or (isLastLabel and label in node): 32 | print(f"{domain}") 33 | break 34 | # Create leaf node 35 | if isLastLabel: 36 | node[label] = LEAF 37 | break 38 | # Create branch node 39 | if label not in node: 40 | node[label] = {} 41 | # Iterate to child node 42 | node = node[label] 43 | 44 | if __name__ == '__main__': 45 | main() 46 | -------------------------------------------------------------------------------- /updater.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import unicode_literals 3 | from argparse import ArgumentParser 4 | 5 | if __name__ == "__main__": 6 | parser = ArgumentParser(description="domains updater from felixonmars") 7 | parser.add_argument( 8 | '-a', '--add', 9 | metavar="DOMAIN", 10 | nargs="+", 11 | help='Add one or more new domain(s) (implies -s)', 12 | ) 13 | parser.add_argument( 14 | '-d', '--delete', 15 | metavar="DOMAIN", 16 | nargs="+", 17 | help='Remove one or more old domain(s) (implies -s)', 18 | ) 19 | parser.add_argument( 20 | '-s', '--sort', 21 | action='store_true', 22 | default=True, 23 | help='Sort the list (default action)', 24 | ) 25 | parser.add_argument( 26 | '-f', '--file', 27 | nargs=1, 28 | default=["domains.txt"], 29 | help="Specify the file to update (domains.txt by default)", 30 | ) 31 | 32 | options = parser.parse_args() 33 | 34 | with open(options.file[0]) as f: 35 | lines = list(f) 36 | 37 | if options.add: 38 | options.sort = True 39 | 40 | for domain in options.add: 41 | new_line = "%s\n" % domain 42 | if new_line in lines: 43 | print("Domain already exists: " + domain) 44 | else: 45 | print("New domain added: " + domain) 46 | lines.append(new_line) 47 | 48 | if options.delete: 49 | options.sort = True 50 | 51 | for domain in options.delete: 52 | target_line = "%s\n" % domain 53 | if target_line not in lines: 54 | print("Failed to remove domain " + domain + ": not found.") 55 | else: 56 | print("Domain removed: " + domain) 57 | lines.remove(target_line) 58 | 59 | if options.sort: 60 | lines.sort(key=lambda x: x.lstrip("#")) 61 | 62 | with open(options.file[0], "w") as f: 63 | f.write(''.join(filter(lambda line: line.strip(), lines))) 64 | -------------------------------------------------------------------------------- /dnsmasq-conf-maker.sh: -------------------------------------------------------------------------------- 1 | ConfFile="./dnsmasq-blocklist.conf" 2 | DefaultDomainList="./domains.txt" 3 | DomainList="$1" 4 | 5 | [ -f "$DefaultDomainList" ] && DomainListExists=1 || DomainListExists=0 6 | 7 | [ "$1" == "" -a "$DomainListExists" == "0" ] && { 8 | echo "Notice: must cleanup old files before!" 9 | echo "the Default Domain List is $DefaultDomainList" 10 | echo "usage: $0 DOMAIN_LIST_FILE" 11 | exit 1 12 | } 13 | 14 | [ "$1" == "" -a "$DomainListExists" == "1" ] && { 15 | echo "using DefaultDomainList: $DefaultDomainList" 16 | DomainList=${DefaultDomainList} 17 | } 18 | 19 | rm -f $ConfFile 2>&1 20 | 21 | echo -n "Only need ipset list? [y/n]:" 22 | read NoServer 23 | [ "$NoServer" == "y" -o "$NoServer" == "Y" ] || { 24 | echo -n "Enter your dns server (syntax IP[#port] ):" 25 | read dnsserver 26 | [ "$dnsserver" == "" ] && { 27 | dnsserver="8.8.8.8" 28 | echo "dns server empty, using 8.8.8.8 instead" 29 | } 30 | # in case using ':' 31 | dnsserver=${dnsserver//:/#} 32 | } 33 | echo -n "Enter your ipset name:" 34 | read ipset 35 | [ "$ipset" == "" ] && { 36 | ipset="UNKNOWN_IPSET_NAME" 37 | echo "ipset name empty!!" 38 | } 39 | 40 | cat $DomainList | while read SingleDomain 41 | do 42 | echo "ipset=/$SingleDomain/${ipset}">>$ConfFile 43 | [ "$NoServer" == "y" -o "$NoServer" == "Y" ] || { 44 | echo "server=/$SingleDomain/${dnsserver}">>$ConfFile 45 | } 46 | done 47 | 48 | ## append special ipset to Conffile, we don't have to exclude dl.google.com anymore 49 | # this will not add domains to ipset list, pls refer to dnsmasq manpage 50 | echo "#### Bypass ipset domains">>$ConfFile 51 | #echo "ipset=/dl.google.com/#">>$ConfFile 52 | cat <<'EOF' | while read BypassIpsetDomain 53 | 265.com 54 | 2mdn.net 55 | app-measurement.com 56 | beacons.gcp.gvt2.com 57 | beacons.gvt2.com 58 | beacons3.gvt2.com 59 | c.admob.com 60 | c.android.clients.google.com 61 | cache.pack.google.com 62 | clientservices.googleapis.com 63 | connectivitycheck.gstatic.com 64 | csi.gstatic.com 65 | dl.google.com 66 | doubleclick.net 67 | e.admob.com 68 | fonts.googleapis.com 69 | fonts.gstatic.com 70 | google-analytics.com 71 | googleadservices.com 72 | googleanalytics.com 73 | googlesyndication.com 74 | googletagmanager.com 75 | googletagservices.com 76 | imasdk.googleapis.com 77 | kh.google.com 78 | khm.google.com 79 | khm.googleapis.com 80 | khm0.google.com 81 | khm0.googleapis.com 82 | khm1.google.com 83 | khm1.googleapis.com 84 | khm2.google.com 85 | khm2.googleapis.com 86 | khm3.google.com 87 | khm3.googleapis.com 88 | khmdb.google.com 89 | khmdb.googleapis.com 90 | media.admob.com 91 | mediavisor.doubleclick.com 92 | redirector.gvt1.com 93 | toolbarqueries.google.com 94 | update.googleapis.com 95 | EOF 96 | do 97 | echo "ipset=/$BypassIpsetDomain/#" >>$ConfFile 98 | done 99 | 100 | -------------------------------------------------------------------------------- /greatfire-fetcher.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Domain list fetcher from GreatFire Analyzer, use at your own risk 3 | # and modify parameters if needed 4 | 5 | # Use this script under clean DNS result and net tunnel 6 | 7 | # Copyright (c) 2019-2021 wongsyrone 8 | 9 | import sys 10 | import traceback 11 | import re 12 | import bs4 13 | import requests 14 | 15 | # https://zh.greatfire.org/analyzer 16 | # Examples: 17 | # https://zh.greatfire.org/search/alexa-top-1000-domains?page=8 18 | # https://zh.greatfire.org/search/domains?page=1091 19 | 20 | # =============================================================== 21 | # PARAMETERS 22 | # =============================================================== 23 | 24 | # Whether we should use proxy 25 | # Configure them if set to 'True' 26 | UseProxy = False 27 | 28 | # some default values can be overwritten 29 | 30 | # Big page count is determined by DefaultMaxPageCount 31 | DefaultGetAllPagesEvenIfWeHaveBigPageCount = False 32 | DefaultMaxPageCount = 100 33 | DefaultBlockThreshold = 80 34 | 35 | DomainListFileName = 'greatfire-domains.txt' 36 | 37 | if UseProxy: 38 | # Should install dependency to support SOCKS protocol 39 | myProxies = { 40 | 'http': 'http://127.0.0.1:1088', 41 | 'https': 'http://127.0.0.1:1088', 42 | } 43 | else: 44 | myProxies = None 45 | 46 | # =============================================================== 47 | # Do NOT change content below if you don't know the meaning 48 | # =============================================================== 49 | AlexaTop1000URL = 'https://zh.greatfire.org/search/alexa-top-1000-domains' 50 | DomainsURL = 'https://zh.greatfire.org/search/domains' 51 | BlockedURL = 'https://zh.greatfire.org/search/blocked' 52 | 53 | domainPattern = re.compile( 54 | r'^(:?(([a-zA-Z]{1})|([a-zA-Z]{1}[a-zA-Z]{1})|' # domain pt.1 55 | r'([a-zA-Z]{1}[0-9]{1})|([0-9]{1}[a-zA-Z]{1})|' # domain pt.2 56 | r'([a-zA-Z0-9][-_a-zA-Z0-9]{0,61}[a-zA-Z0-9]))\.)+' # domain pt.3 57 | r'([a-zA-Z]{2,13}|(xn--[a-zA-Z0-9]{2,30}))$' # TLD 58 | ) 59 | 60 | # for page count 61 | # max length of digits from backwards 62 | hrefPageParamPattern = re.compile(r'(?<=page=)\d+') 63 | 64 | domainDict = {} 65 | 66 | 67 | def populate_domain_blockPercent(soup_instance): 68 | """ 69 | fill blocked domain list 70 | :param soup_instance: 71 | :return: None 72 | """ 73 | if soup_instance is None: 74 | raise TypeError 75 | entries = soup_instance.select('td[class="first"]') 76 | # table layout: 77 | # domain-name date block-percent tags 78 | for entry in entries: 79 | domainName = entry.text.strip() 80 | blockPercentElem = entry.next_sibling.next_sibling 81 | if "blocked" in blockPercentElem["class"]: 82 | # remove '%' sign 83 | blockPercent = int(blockPercentElem.text.strip()[:-1]) 84 | domainDict[domainName] = blockPercent 85 | 86 | 87 | def get_first_page_and_count(url, proxies): 88 | """ 89 | get first page raw HTML and page Count 90 | :param url: 91 | :param proxies: 92 | :return: {rawHTML, pageCount, soupIns} or None if error occurred 93 | """ 94 | try: 95 | print(f'handling {url} and get count') 96 | req = requests.get(url=url, proxies=proxies) 97 | if req.status_code != requests.codes.ok: 98 | return None 99 | pageRawHTML = req.text 100 | mysoup = bs4.BeautifulSoup(pageRawHTML, features='lxml') 101 | entry = mysoup.select('li[class="pager-last last"] a') 102 | href = entry[0].attrs["href"] 103 | total_page_count = int(hrefPageParamPattern.findall(href)[0]) 104 | print(f'{url} has pageCount {total_page_count} <= plus one to get actual page count') 105 | return {'rawHTML': pageRawHTML, 'pageCount': total_page_count, 'soupIns': mysoup} 106 | except: 107 | traceback.print_exc(file=sys.stdout) 108 | return None 109 | 110 | 111 | def get_page_content(url, pageIndex, proxies): 112 | """ 113 | get the whole page via page index 114 | :param url: 115 | :param pageIndex: 116 | :param proxies: 117 | :return: BeautifulSoup Instance or None if error occurred 118 | """ 119 | try: 120 | print(f'handling {url} page {pageIndex}') 121 | req = requests.get(url=url, params={"page": pageIndex}, proxies=proxies) 122 | if req.status_code != requests.codes.ok: 123 | return None 124 | return bs4.BeautifulSoup(req.text, features='lxml') 125 | except: 126 | traceback.print_exc(file=sys.stdout) 127 | return None 128 | 129 | 130 | def do_url(url, proxies, getAllPagesEvenIfWeHaveBigPageCount=DefaultGetAllPagesEvenIfWeHaveBigPageCount, 131 | maxPageCount=DefaultMaxPageCount): 132 | """ 133 | fetch, parse and populate blocked domain list 134 | :param url: 135 | :param proxies: 136 | :return: None 137 | """ 138 | tmpDict = get_first_page_and_count(url, proxies) 139 | if tmpDict is None: 140 | print("fail to get first page") 141 | raise ConnectionError 142 | last_page_num = tmpDict['pageCount'] 143 | total_page_count = last_page_num + 1 144 | # the actual page one (it doesn't need ?page= parameter) 145 | populate_domain_blockPercent(tmpDict['soupIns']) 146 | # get page range from the actual page two 147 | if not getAllPagesEvenIfWeHaveBigPageCount and total_page_count > maxPageCount+1: 148 | pageRange = range(1, maxPageCount+1) 149 | else: 150 | # only when we specify we should fetch all pages or we have only a few to fetch 151 | pageRange = range(1, total_page_count) 152 | for i in pageRange: 153 | soup = get_page_content(url, i, proxies) 154 | if soup is None: 155 | print(f"fail to get {url} page {i}") 156 | raise ConnectionError 157 | populate_domain_blockPercent(soup) 158 | 159 | 160 | def is_valid_domain(domainStr): 161 | try: 162 | return domainPattern.fullmatch(domainStr) != None 163 | except: 164 | return False 165 | 166 | 167 | def write_file(content): 168 | with open(DomainListFileName, mode='w', encoding='utf-8', newline='\n') as fd: 169 | fd.write(content) 170 | fd.flush() 171 | 172 | 173 | # =============================================================== 174 | # PROCEDURES 175 | # =============================================================== 176 | 177 | if __name__ == '__main__': 178 | try: 179 | # download and populate mapping dictionary 180 | do_url(AlexaTop1000URL, myProxies, getAllPagesEvenIfWeHaveBigPageCount=True) 181 | do_url(DomainsURL, myProxies) 182 | do_url(BlockedURL, myProxies, getAllPagesEvenIfWeHaveBigPageCount=True) 183 | 184 | # handle threshold 185 | filteredDict = {domainName: blockPercent for domainName, blockPercent in domainDict.items() if 186 | blockPercent >= DefaultBlockThreshold} 187 | 188 | resultList = list(filteredDict.keys()) 189 | 190 | # handle invalid domains 191 | validDomainResultList = [item for item in resultList if is_valid_domain(item)] 192 | 193 | # to lower 194 | lowerValidDomainResultList = [item.lower() for item in validDomainResultList] 195 | 196 | # write sorted file 197 | write_file('\n'.join(sorted(lowerValidDomainResultList))) 198 | except: 199 | traceback.print_exc(file=sys.stdout) 200 | --------------------------------------------------------------------------------