├── .gitignore ├── LICENSE ├── README.md └── https.go /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | ### Go template 3 | # Binaries for programs and plugins 4 | *.exe 5 | *.dll 6 | *.so 7 | *.dylib 8 | 9 | # Test binary, build with `go test -c` 10 | *.test 11 | 12 | # Output of the go coverage tool, specifically when used with LiteIDE 13 | *.out 14 | 15 | # Project-local glide cache, RE: https://github.com/Masterminds/glide/issues/736 16 | .glide/ 17 | 18 | .idea/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. 
For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Go HTTPS Everywhere 2 | =================== 3 | 4 | [![GoDoc](https://godoc.org/github.com/tenta-browser/go-https-everywhere?status.svg)](https://godoc.org/github.com/tenta-browser/go-https-everywhere) 5 | 6 | [HTTPS Everywhere](https://github.com/EFForg/https-everywhere) rewrite engine implementation in Golang. 
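A minimal usage sketch (the `rules` path is illustrative, and a concrete regex engine for the go-pcre-matcher bridge must be wired up in the host environment first):

```go
package main

import (
	"fmt"

	https "github.com/tenta-browser/go-https-everywhere"
)

func main() {
	// Build the in-memory rule structures from a directory of
	// HTTPS Everywhere ruleset XML files.
	ev, err := https.Parse("rules")
	if err != nil {
		panic(err)
	}

	// Attempt an HTTP -> HTTPS upgrade; an empty result with a nil error
	// means the targets matched but no rewrite rule applied.
	out, err := ev.TryRewrite("http://example.com/")
	if err != nil {
		panic(err)
	}
	fmt.Println(out)
}
```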
7 | 8 | Contains exports for compressed ruleset construction, client-side read-only use, 9 | the reconstruction of compressed rules into memory, and the actual URL rewrite logic. Matching and rewrite operations 10 | use a [regex interface bridge](https://github.com/tenta-browser/go-pcre-matcher) package, which can be implemented in the target 11 | environment. 12 | 13 | The cookie secure flag feature is currently missing; it will be added in a future iteration. 14 | 15 | Contact: developer@tenta.io 16 | 17 | Installation 18 | ============ 19 | 20 | 1. `go get github.com/tenta-browser/go-https-everywhere` 21 | 22 | API 23 | === 24 | 25 | * `Parse()`: Reads the rule files and constructs the rulesets in memory 26 | * `Encode()`/`Decode()`/`EncodeToPath()`: Handle encode and decode operations 27 | * `TryRewrite()`: Searches and (if applicable) rewrites the input URL according to the rewrite rules 28 | * `ShowStats()`: Prints a line of encoding statistics 29 | 30 | License 31 | ======= 32 | 33 | Licensed under the Apache License, Version 2.0 (the "License"); 34 | you may not use this file except in compliance with the License. 35 | You may obtain a copy of the License at 36 | 37 | http://www.apache.org/licenses/LICENSE-2.0 38 | 39 | Unless required by applicable law or agreed to in writing, software 40 | distributed under the License is distributed on an "AS IS" BASIS, 41 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 42 | See the License for the specific language governing permissions and 43 | limitations under the License. 44 | 45 | For any questions, please contact developer@tenta.io 46 | 47 | Contributing 48 | ============ 49 | 50 | We welcome contributions, feedback and plain old complaining. Feel free to open 51 | an issue or shoot us a message at developer@tenta.io. If you'd like to contribute, 52 | please open a pull request and send us an email to sign a contributor agreement. 53 | 54 | About the EFF 55 | ============= 56 | 57 | HTTPS Everywhere is a project of the Electronic Frontier Foundation. 58 | 59 | The Electronic Frontier Foundation is the leading nonprofit organization defending civil liberties in the digital world. Founded in 1990, EFF champions user privacy, free expression, and innovation through impact litigation, policy analysis, grassroots activism, and technology development. 60 | 61 | [Support the EFF and HTTPS Everywhere](https://supporters.eff.org/donate/support-https-everywhere) 62 | 63 | About Tenta 64 | =========== 65 | 66 | This HTTPS Everywhere library is brought to you by Team Tenta. Tenta is your [private, encrypted browser](https://tenta.com) that protects your data instead of selling it. We're building a next-generation browser that combines all the privacy tools you need, including built-in OpenVPN. Everything is encrypted by default. That means your bookmarks, saved tabs, web history, web traffic, downloaded files, IP address and DNS. A truly incognito browser that's fast and easy. 67 | -------------------------------------------------------------------------------- /https.go: -------------------------------------------------------------------------------- 1 | /** 2 | * Go HTTPS Everywhere 3 | * 4 | * Copyright 2017 Tenta, LLC 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 
8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | * 18 | * For any questions, please contact developer@tenta.io 19 | * 20 | * https.go: Go HTTPS Everywhere Engine 21 | */ 22 | 23 | // A golang implementation of the EFF's HTTPS Everywhere 24 | // This source contains exports for both _server_ and client-side use: the construction of the ruleset in a proprietary format, 25 | // its reconstruction into memory, and the actual intended URL rewrite logic 26 | // The general approach is to teach a filter about the simple target hosts (no wildcard, or one wildcard at one of the endings of the pattern); 27 | // this screens out the vast majority of sites (with testing time well below a millisecond). Complicated wildcards are saved in a map, 28 | // transformed into a pcre regex pattern (the * token will be expanded into [^\.]+) and precompiled for speed (right now there are 22 such cases), 29 | // and tried by matching the input versus the compiled regex; lastly there's a hashmap (whose keys are the 32bit hash of the target string representation, 30 | // and whose values are the associated rulesets) which will load the available rules to apply; this is the filtering and retrieval logic. 31 | // Upon generating the structure, hash collisions are handled by evacuating the colliding entry into the forward structure. 32 | // A flow of rewrite is as follows: 33 | // 1. try to match input to forward map regexes 34 | // 1.1 if match occurs, apply the first rule that fits (return if url is excluded), return the resulting url 35 | // 2. try url in filter, if it is not found (note: there are no false negatives), return 36 | // 3. find the associated rulesets with combinations {url, url_first_subdomain_wildcarded, url_tld_wildcarded} 37 | // (Example: input = somesubdomain.example.com -> {somesubdomain.example.com, *.example.com, somesubdomain.example.*}) 38 | // 3.1 if there's a match, apply the first rule that fits (return if url is excluded), and return the new url 39 | // Encoding takes the structures and serializes them in a space optimized format: cuckoofilter already has an encode implemented, the slice is encoded in a straightforward manner, 40 | // and regularMap (aka map[uint32][]int, aka hash(url)->array(of_applicable_rulesets)) needs an extra step -- since the unique values are around 5K (the `int`s from all the `[]int`s), 41 | // the implementation is to flip the map and encode a [][]uint32, 42 | // where the index of the first dimension is the value from the map, the second index is the order of occurrence of the hash, and the uint32 values are the hashes
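// To illustrate the flip with hypothetical hash values h1..h3: the map {h1: [0], h2: [0, 1], h3: [1]} flips to [[h1, h2], [h2, h3]],
// which is serialized as [chosen delimiter][h1][h2][delimiter][h2][h3][delimiter]; the decoder rebuilds the original map by
// incrementing a running slice index at every delimiter it encounters (see encodeRegularMap/decodeRegularMap below).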
43 | // Exported functions: 44 | // Parse -- reads the rules from the given path, and constructs the appropriate structures, resulting in an HtEvSt or an error 45 | // Encode/EncodeToPath/Decode -- as their names suggest, handle encoding and decoding of the structure; EncodeToPath flushes the compressed format to a specified file. 46 | // TryRewrite -- searches for and applies the appropriate rewrite rules, returns the new url (the old one if no target matched, an empty string if a target matched but no rewrite rule applied) or an error 47 | // ShowStats -- prints the statistics of the structure in memory 48 | package https 49 | 50 | import ( 51 | 52 | "encoding/xml" 53 | "fmt" 54 | "hash/fnv" 55 | "io/ioutil" 56 | "math" 57 | "strings" 58 | "time" 59 | 60 | "github.com/seiflotfy/cuckoofilter" 61 | serialize "github.com/tenta-browser/go-bitstream-ops" 62 | goutils "github.com/tenta-browser/go-pcre-matcher" 63 | "golang.org/x/net/publicsuffix" 64 | ) 65 | 66 | // LogEnabled controls whether a log is written 67 | var LogEnabled = true 68 | 69 | /// maps cannot be declared const 70 | var possibleUintDelimiter = map[uint32]bool{0xfadea3a1: true, 0xbaadface: true, 0xfefefefe: true, 0x0dabbee2: true, 0xffffffff: true} 71 | var chosenDelimiter uint32 72 | var encodeBits = 16 73 | var encodeMap map[int]int 74 | 75 | // RuleSt - structure used to import rewrite data from xml 76 | type RuleSt struct { 77 | From string `xml:"from,attr"` 78 | To string `xml:"to,attr"` 79 | } 80 | 81 | // TargetSt - structure used to import target data from xml 82 | type TargetSt struct { 83 | Host string `xml:"host,attr"` 84 | } 85 | 86 | // ExclusionSt - structure used to import exclusion data from xml 87 | type ExclusionSt struct { 88 | Pattern string `xml:"pattern,attr"` 89 | } 90 | 91 | // TestSt - structure used to import testing data from xml 92 | type TestSt struct { 93 | URL string `xml:"url,attr"` 94 | } 95 | 96 | // RulesetSt - represents an xml rule file 97 | type RulesetSt struct { 98 | Index int 99 | Name string `xml:"name,attr"` 100 | Disabled string `xml:"default_off,attr"` 101 | Platform string `xml:"platform,attr"` 102 | Target []TargetSt `xml:"target"` 103 | Rule []RuleSt `xml:"rule"` 104 | Exclusion []ExclusionSt `xml:"exclusion"` 105 | Test []TestSt `xml:"test"` 106 | } 107 | 108 | // SimplifiedRulesetSt - a rule file, holding only necessary data 109 | type SimplifiedRulesetSt struct { 110 | exclusion, ruleFrom, ruleTo []string 111 | } 112 | 113 | // HtEvSt - internal representation of the input rulesets 114 | type HtEvSt struct { 115 | filterBytesNum, forwardBytesNum int 116 | input []*RulesetSt 117 | filter *cuckoofilter.CuckooFilter 118 | forward map[string][]*RulesetSt 119 | optimizedForward map[goutils.Regexp][]int 120 | regularMap map[uint32][]int 121 | regularSlice []*SimplifiedRulesetSt 122 | } 123 | 124 | // RuleIndex - used for indexing the RulesetSt's as they are generated 125 | var RuleIndex int 126 | 127 | func tokenizeURL(in string) (scheme, domain, site string, subdomain []string, e error) { 128 | /// detach the scheme part 129 | 130 | if strings.HasPrefix(in, "http://") { 131 | scheme = "http" 132 | in = in[7:] 133 | } else { 134 | e = fmt.Errorf("protocol is not supported") 135 | return 136 | } 137 | /// now detach the site part, or rather anything that comes after the `/` token 138 | si := strings.Index(in, "/") 139 | if si > -1 { 140 | site = in[si+1:] 141 | in = in[:si] 142 | } 143 | 144 | domain, e = publicsuffix.EffectiveTLDPlusOne(in) 145 | if e != nil { 146 | e = fmt.Errorf("publicsuffix error [%s]", e.Error()) 147 | return 148 | } 149 | 150 | si = strings.Index(in, domain) 151 | if si > 0 { 152 | subdomain = strings.Split(in[:si-1], ".") 153 | } 154 | 155 | return 156 | } 157 | 158 | /// search for the linked rule structure in the standard fashion: forward map (by regex), then filter and regular map 159 | func (h *HtEvSt) search(t string) (ruleInd []int, e error) { 160 |
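/// worked example of the variations built below (assuming input t = "http://somesubdomain.example.com"):
/// variations = {"somesubdomain.example.com", "somesubdomain.example.*", "*.somesubdomain.example.com", "*.example.com"}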
161 | _, domain, _, subdomain, e := tokenizeURL(t) 162 | if e != nil { 163 | return nil, fmt.Errorf("cannot tokenize [%s]", e.Error()) 164 | } 165 | 166 | orig := append(subdomain, strings.Split(domain, ".")...) 167 | origLen := len(orig) 168 | variations := []string{strings.Join(orig, "."), strings.Join(append(orig[:origLen-1], "*"), ".")} 169 | /// wildcard subdomains only if they exist in the url 170 | if subdomain != nil { 171 | variations = append(variations, strings.Join(append([]string{"*"}, variations[0]), "."), strings.Join(append([]string{"*"}, orig[1:]...), ".")) 172 | } 173 | /// first check the forward map 174 | /// its keys are regexes (precompiled in the decode phase), so we try to apply each of them 175 | 176 | for k, v := range h.optimizedForward { 177 | if m := k.Search(variations[0]); m != nil && m.GroupPresentByIdx(0) == true { 178 | if LogEnabled { 179 | fmt.Printf("Rule struct found via forward.\n") 180 | } 181 | if ruleInd == nil { 182 | ruleInd = make([]int, 0) 183 | } 184 | ruleInd = append(ruleInd, v...) 185 | } 186 | } 187 | if LogEnabled { 188 | fmt.Printf("Searching using [%v]\n", variations) 189 | } 190 | /// next, check filter and map 191 | for _, v := range variations { 192 | if h.filter.Lookup([]byte(v)) { 193 | if ind, contains := h.regularMap[hash(v)]; contains { 194 | if LogEnabled { 195 | fmt.Printf("Rule struct found via filter+regular.\n") 196 | } 197 | if ruleInd == nil { 198 | ruleInd = make([]int, 0) 199 | } 200 | ruleInd = append(ruleInd, ind...) 201 | 202 | } 203 | } 204 | } 205 | 206 | return ruleInd, nil 207 | } 208 | 209 | // TryRewrite - exported function which finds the rule struct (if applicable), and applies the (most) appropriate rewrite rule 210 | // tri-state return: problem -> e != nil, no applicable rewrite -> out == "" && e == nil, rewritten (or no target matched) -> out != "" && e == nil 211 | func (h *HtEvSt) TryRewrite(in string) (out string, e error) { 212 | out = in 213 | start := time.Now() 214 | ruleIndices, e := h.search(in) 215 | if e != nil { 216 | return "", fmt.Errorf("search error [%s]", e.Error()) 217 | } 218 | if ruleIndices == nil { 219 | return 220 | } 221 | if LogEnabled { 222 | fmt.Printf("Search yielded for [%s] %d rule sets\n", in, len(ruleIndices)) 223 | } 224 | 225 | /// here comes another batch of pcre-dependent code 226 | for _, ri := range ruleIndices { 227 | rule := h.regularSlice[ri] 228 | needsToContinue := false 229 | for _, excl := range rule.exclusion { 230 | re, e := goutils.ReEngine.Compile(excl, 0) 231 | if e != nil { 232 | return "", fmt.Errorf("cannot compile exclusion [%s]", e.Error()) 233 | } 234 | /// try matching the exclusions 235 | if m := re.Search(in); m != nil && m.GroupPresentByIdx(0) == true { 236 | if LogEnabled { 237 | fmt.Printf("Input [%s] excluded via pattern [%s].\n", in, excl) 238 | } 239 | needsToContinue = true 240 | break 241 | } 242 | } 243 | if needsToContinue { 244 | continue 245 | } 246 | /// by getting this far, we have to find a rule for our input (if not, it's a https-everywhere rule collection miss (theoretically, at least)) 247 | for i, rewrite := range rule.ruleFrom { 248 | re, e := goutils.ReEngine.Compile(rewrite, 0) 249 | if e != nil { 250 | return "", fmt.Errorf("cannot compile rewrite [%s]", e.Error()) 251 | } 252 | 253 | if m := re.Search(in); m != nil && m.GroupPresentByIdx(0) == true { 254 | if LogEnabled { 255 | fmt.Printf("Input [%s] matching rewrite pattern [%s].\n", in, rewrite) 256 | } 257 | out := 
re.Replace(in, rule.ruleTo[i]) 258 | if LogEnabled { 259 | fmt.Printf("Rewrote to [%s]\n", out) 260 | fmt.Printf("Search+Rewrite took [%v] time.\n", time.Since(start)) 261 | } 262 | return out, nil 263 | } 264 | } 265 | } 266 | /// in a first run let's handle as non-error the case when no rewrite rule could be found (but targets match) 267 | return "", nil 268 | } 269 | 270 | func (h *HtEvSt) newRulesetSt() (r *RulesetSt) { 271 | r = &RulesetSt{Index: RuleIndex} 272 | RuleIndex++ 273 | h.input = append(h.input, r) 274 | if len(h.input) != RuleIndex { 275 | panic("index mismatch") 276 | } 277 | return 278 | } 279 | 280 | func newSimplifiedRulesetSt() *SimplifiedRulesetSt { 281 | //return &SimplifiedRulesetSt{make([]string, 0), make([]string, 0), make([]string, 0)} 282 | return &SimplifiedRulesetSt{} 283 | } 284 | 285 | /// decodes forward map into the optimizedForward regex map; input does not contain the byte length uint32 286 | func decodeForwardMap(b []byte) (m map[goutils.Regexp][]int, e error) { 287 | r := serialize.NewBitStreamOpsReader(b) 288 | m = make(map[goutils.Regexp][]int) 289 | var temp uint 290 | var temps string 291 | /// straight _forward_ loop 292 | for r.HasMoreBytes() { 293 | if temp, e = r.Collect(32); e != nil { 294 | return nil, fmt.Errorf("error collecting str length [%s]", e.Error()) 295 | } 296 | if temps, e = r.DeConcat(int(temp)); e != nil { 297 | return nil, fmt.Errorf("error collecting string [%s]", e.Error()) 298 | } 299 | if temp, e = r.Collect(32); e != nil { 300 | return nil, fmt.Errorf("error collecting slice index [%s]", e.Error()) 301 | } 302 | /// to be able to look them up using regex (does not exactly validate for valid domain composition) 303 | re, e := goutils.ReEngine.Compile(strings.Replace(temps, "*", "[\\w-]+", -1), 0) 304 | if e != nil { 305 | return nil, fmt.Errorf("cannot compile wildcard domain in forward map") 306 | } 307 | if m[re] == nil { 308 | m[re] = make([]int, 0) 309 | } 310 | m[re] = append(m[re], int(temp)) 311 | } 312 | 313 | return 314 | } 315 | 316 | /// encodes forward map (a formality really, since around 20 entries here), references indices from regularSlice 317 | /// byte length will be added further up in the callstack 318 | func encodeForwardMap(h *HtEvSt) (ret []byte, e error) { 319 | b := serialize.NewBitStreamOps() 320 | /// no need to add length param here as it does not speedup decode 321 | /// basic scheme: ([strlen][str][index]){len(h.forward)} 322 | for target, ruleArr := range h.forward { 323 | for _, rule := range ruleArr { 324 | if e = b.Emit(uint(len(target)), 32); e != nil { 325 | return nil, fmt.Errorf("error emitting str length [%s]", e.Error()) 326 | } 327 | b.Concat(target) 328 | if e = b.Emit(uint(encodeMap[rule.Index]), 32); e != nil { 329 | return nil, fmt.Errorf("error emitting index [%s]", e.Error()) 330 | } 331 | } 332 | } 333 | return b.Buffer(), nil 334 | } 335 | 336 | /// some of these functions are wrappers 337 | /// does not include byte slice length uint 338 | func decodeRegularSlice(b []byte) ([]*SimplifiedRulesetSt, error) { 339 | r := serialize.NewBitStreamOpsReader(b) 340 | var sliceLen, elemLen uint 341 | var elem []byte 342 | var e error 343 | if sliceLen, e = r.Collect(32); e != nil { 344 | return nil, fmt.Errorf("decodeRegularSlice error [%s]", e.Error()) 345 | } 346 | ret := make([]*SimplifiedRulesetSt, int(sliceLen)) 347 | for i := 0; i < int(sliceLen); i++ { 348 | if elemLen, e = r.Collect(32); e != nil { 349 | return nil, fmt.Errorf("decodeRegularSlice elem size error [%s]", e.Error()) 
350 | } 351 | if elem, e = r.DeAppend(int(elemLen)); e != nil { 352 | return nil, fmt.Errorf("decodeRegularSlice elem error [%s]", e.Error()) 353 | } 354 | ret[i] = newSimplifiedRulesetSt() 355 | if e = ret[i].decode(elem); e != nil { 356 | return nil, e 357 | } 358 | } 359 | 360 | return ret, e 361 | } 362 | 363 | /// does not include whole encoding length (part of why it's delegated to a function) 364 | func encodeRegularSlice(r []*SimplifiedRulesetSt) (ret []byte, e error) { 365 | b := serialize.NewBitStreamOps() 366 | b.Emit(uint(len(r)), 32) 367 | for _, sr := range r { 368 | t := sr.encode() 369 | b.Append(t) 370 | } 371 | ret = b.Buffer() 372 | return 373 | } 374 | 375 | /// decoding of the regular map. (iterative construction) 376 | func decodeRegularMap(b []byte) (m map[uint32][]int, e error) { 377 | /// byte slice comes without the length declaration (it's used further up in the stack) 378 | r := serialize.NewBitStreamOpsReader(b) 379 | m = make(map[uint32][]int) 380 | var delimiter, temp uint 381 | 382 | if delimiter, e = r.Collect(32); e != nil { 383 | return nil, fmt.Errorf("cannot collect delimiter for regular map [%s]", e.Error()) 384 | } 385 | 386 | runningIndex := 0 387 | for r.HasMoreBytes() { 388 | if temp, e = r.Collect(32); e != nil { 389 | return nil, fmt.Errorf("cannot collect temp value for regular map [%s]", e.Error()) 390 | } 391 | 392 | /// check if we need to increment `index` 393 | if temp == delimiter { 394 | runningIndex++ 395 | } else { /// or we can save temp as key 396 | if m[uint32(temp)] == nil { 397 | m[uint32(temp)] = make([]int, 0) 398 | } 399 | m[uint32(temp)] = append(m[uint32(temp)], runningIndex) 400 | } 401 | } 402 | 403 | // if logEnabled { 404 | // fmt.Printf("Reconstructed regular map. it has dimension of [%d]\n", len(m)) 405 | // } 406 | return 407 | } 408 | 409 | /// encoding of the regular map. 
this one is tricky, for in-depth process and design choices, consult paragraph from beginning of file 410 | func encodeRegularMap(h *HtEvSt) (ret []byte, e error) { 411 | b := serialize.NewBitStreamOps() 412 | /// gets the number of possible indices in the map (the values) 413 | numIndices := len(h.regularSlice) 414 | /// allocate a temporary triaging slice, for inverting the map (key-value-wise) 415 | temp := make([][]uint32, numIndices) 416 | /// k is the 32bit hash of the target, v is the index in the regularSlice 417 | for k, vArr := range h.regularMap { 418 | for _, v := range vArr { 419 | /// also check for the delimiter 420 | if _, contains := possibleUintDelimiter[k]; contains { 421 | possibleUintDelimiter[k] = false 422 | hasAtLeastOne := false 423 | for _, validDelimiter := range possibleUintDelimiter { 424 | if validDelimiter { 425 | hasAtLeastOne = true 426 | break 427 | } 428 | } 429 | if !hasAtLeastOne { 430 | panic("Exhausted delimiter options.\n") 431 | } 432 | } 433 | 434 | if temp[v] == nil { 435 | temp[v] = make([]uint32, 0) 436 | } 437 | temp[v] = append(temp[v], k) 438 | } 439 | } 440 | /// let's settle on the delimiter 441 | var delimiter uint32 442 | for del, valid := range possibleUintDelimiter { 443 | if valid { 444 | delimiter = del 445 | } 446 | } 447 | /// okay, now we can finally encode the flipped map 448 | /// we write the delimiter (as length will be written up in the stack) 449 | if e = b.Emit(uint(delimiter), 32); e != nil { 450 | return nil, fmt.Errorf("cannot emit delimiter for regular map [%s]", e.Error()) 451 | } 452 | /// now iterate over the temp slice 453 | for i, uints := range temp { 454 | /// emitting each hash value referring to this index 455 | for _, anuint := range uints { 456 | if e = b.Emit(uint(anuint), 32); e != nil { 457 | return nil, fmt.Errorf("cannot emit uint32 value regular map [%s]", e.Error()) 458 | } 459 | } 460 | /// emitting index incrementing delimiter 461 | if e = b.Emit(uint(delimiter), 32); e != nil { 462 | return nil, fmt.Errorf("cannot emit delimiter for regular map [%d/%d] [%s]", i, len(temp), e.Error()) 463 | } 464 | } 465 | return b.Buffer(), e 466 | } 467 | 468 | /// input byte slice without the leading uint 469 | func (s *SimplifiedRulesetSt) decode(b []byte) error { 470 | if s.exclusion != nil || s.ruleFrom != nil || s.ruleTo != nil { 471 | return fmt.Errorf("not a blank structure") 472 | } 473 | var exclSize, fromtoSize, currStrlen uint 474 | var e error 475 | r := serialize.NewBitStreamOpsReader(b) 476 | if exclSize, e = r.Collect(encodeBits); e != nil { 477 | return fmt.Errorf("decode error [%s]", e.Error()) 478 | } 479 | s.exclusion = make([]string, exclSize) 480 | for i := 0; i < int(exclSize); i++ { 481 | if currStrlen, e = r.Collect(encodeBits); e != nil { 482 | return fmt.Errorf("decode error [%s]", e.Error()) 483 | } 484 | if s.exclusion[i], e = r.DeConcat(int(currStrlen)); e != nil { 485 | return fmt.Errorf("deconcat error [%s]", e.Error()) 486 | } 487 | } 488 | if fromtoSize, e = r.Collect(encodeBits); e != nil { 489 | return fmt.Errorf("decode error [%s]", e.Error()) 490 | } 491 | s.ruleFrom = make([]string, fromtoSize) 492 | s.ruleTo = make([]string, fromtoSize) 493 | for i := 0; i < int(fromtoSize); i++ { 494 | if currStrlen, e = r.Collect(encodeBits); e != nil { 495 | return fmt.Errorf("decode error [%s]", e.Error()) 496 | } 497 | 498 | if s.ruleFrom[i], e = r.DeConcat(int(currStrlen)); e != nil { 499 | return fmt.Errorf("deconcat error [%s]", e.Error()) 500 | } 501 | if currStrlen, e = 
r.Collect(encodeBits); e != nil { 502 | return fmt.Errorf("decode error [%s]", e.Error()) 503 | } 504 | 505 | if s.ruleTo[i], e = r.DeConcat(int(currStrlen)); e != nil { 506 | return fmt.Errorf("deconcat error [%s]", e.Error()) 507 | } 508 | } 509 | return nil 510 | } 511 | 512 | /// emits a uint up front which declares the byte slice length 513 | func (s *SimplifiedRulesetSt) encode() []byte { 514 | b := serialize.NewBitStreamOps() 515 | /// overall byte count, not counting this leading uint32 itself 516 | overallByteNum := (encodeBits/8)*(len(s.exclusion)+2*len(s.ruleFrom)+2) + s.countChars() 517 | b.Emit(uint(overallByteNum), 32) 518 | b.Emit(uint(len(s.exclusion)), encodeBits) 519 | for _, e := range s.exclusion { 520 | b.Emit(uint(len(e)), encodeBits) 521 | if len(e) > int(math.Pow(2, float64(encodeBits))-1) { 522 | panic(fmt.Sprintf("Size limitation exceeded. [excl] [%d][%s]\n", len(e), e)) 523 | } 524 | b.Concat(e) 525 | } 526 | 527 | b.Emit(uint(len(s.ruleFrom)), encodeBits) 528 | for i, e := range s.ruleFrom { 529 | b.Emit(uint(len(e)), encodeBits) 530 | b.Concat(e) 531 | if len(e) > int(math.Pow(2, float64(encodeBits))-1) { 532 | panic(fmt.Sprintf("Size limitation exceeded. [from] [%d][%s]\n", len(e), e)) 533 | } 534 | b.Emit(uint(len(s.ruleTo[i])), encodeBits) 535 | b.Concat(s.ruleTo[i]) 536 | if len(s.ruleTo[i]) > int(math.Pow(2, float64(encodeBits))-1) { 537 | panic(fmt.Sprintf("Size limitation exceeded. [to] [%d][%s]\n", len(s.ruleTo[i]), s.ruleTo[i])) 538 | } 539 | 540 | } 541 | return b.Buffer() 542 | } 543 | 544 | /// shortcut to check for the most common rule 545 | func (s *SimplifiedRulesetSt) isDefaultRule() bool { 546 | if len(s.exclusion) == 0 && len(s.ruleFrom) == 1 && len(s.ruleTo) == 1 && s.ruleFrom[0] == "^http:" && s.ruleTo[0] == "https:" { 547 | return true 548 | } 549 | return false 550 | } 551 | 552 | func (s *SimplifiedRulesetSt) countChars() (n int) { 553 | a := [][]string{s.exclusion, s.ruleFrom, s.ruleTo} 554 | for _, z := range a { 555 | for _, str := range z { 556 | n += len(str) 557 | } 558 | } 559 | return 560 | } 561 | 562 | func (s *SimplifiedRulesetSt) String() (z string) { 563 | z += fmt.Sprintf("E:") 564 | for _, e := range s.exclusion { 565 | z += fmt.Sprintf("[%s]", e) 566 | } 567 | z += fmt.Sprintf(" P:") 568 | for i, e := range s.ruleFrom { 569 | z += fmt.Sprintf("[%s->%s]", e, s.ruleTo[i]) 570 | } 571 | return 572 | } 573 | 574 | func (r *RulesetSt) String() (s string) { 575 | s = fmt.Sprintf("\tName [%s]\n", r.Name) 576 | s += fmt.Sprintf("\tTargets: ") 577 | for _, e := range r.Target { 578 | s += fmt.Sprintf("[%s]", e.Host) 579 | } 580 | s += fmt.Sprintf("\n") 581 | s += fmt.Sprintf("\tExclusions: ") 582 | for _, e := range r.Exclusion { 583 | s += fmt.Sprintf("[%s]", e.Pattern) 584 | } 585 | s += fmt.Sprintf("\n") 586 | s += fmt.Sprintf("\tRules: ") 587 | for _, e := range r.Rule { 588 | s += fmt.Sprintf("[%s->%s]", e.From, e.To) 589 | } 590 | s += fmt.Sprintf("\n") 591 | return 592 | } 593 | 594 | func (r *RulesetSt) countChars() (n int) { 595 | for _, e := range r.Exclusion { 596 | n += len(e.Pattern) 597 | } 598 | for _, e := range r.Target { 599 | n += len(e.Host) 600 | } 601 | for _, e := range r.Rule { 602 | n += len(e.From) + len(e.To) 603 | } 604 | return 605 | } 606 | 607 | func (r *RulesetSt) simplify() (s *SimplifiedRulesetSt) { 608 | s = &SimplifiedRulesetSt{make([]string, 0), make([]string, 0), make([]string, 0)} 609 | 610 | for _, e := range r.Exclusion { 611 | s.exclusion = append(s.exclusion, e.Pattern) 612 | } 613 | for _, rl := range r.Rule { 614 | s.ruleFrom = append(s.ruleFrom, 
rl.From) 615 | s.ruleTo = append(s.ruleTo, rl.To) 616 | } 617 | 618 | return 619 | } 620 | 621 | // ShowStats - prints a statistics line about the internal structure 622 | func (h *HtEvSt) ShowStats() { 623 | var cnt int 624 | for _, r := range h.regularSlice { 625 | cnt += r.countChars() 626 | } 627 | if LogEnabled { 628 | fmt.Printf("We have filter [%d], slice [%d], map [%d], forward [%d]. Chars [%d]\n", h.filter.Count(), len(h.regularSlice), len(h.regularMap), len(h.optimizedForward), cnt) 629 | } 630 | } 631 | 632 | // Decode - reconstructs the structure from a byte slice 633 | func Decode(b []byte) (h *HtEvSt, e error) { 634 | r := serialize.NewBitStreamOpsReader(b) 635 | h = &HtEvSt{} 636 | var temp uint 637 | var tempb []byte 638 | remaining := len(b) 639 | /// read length of filter bytes 640 | if temp, e = r.Collect(32); e != nil { 641 | return nil, fmt.Errorf("Cannot read length of filter [%s]", e.Error()) 642 | } 643 | remaining -= int(temp) + 4 644 | if LogEnabled { 645 | fmt.Printf("Detaching %d bytes for filter data -- remains %d\n", int(temp), remaining) 646 | } 647 | /// detach (yeah, that's what this function should be called) encoded filter bytes 648 | if tempb, e = r.DeAppend(int(temp)); e != nil { 649 | return nil, fmt.Errorf("Cannot read filter bytes [%s]", e.Error()) 650 | } 651 | /// decode filter 652 | if h.filter, e = cuckoofilter.Decode(tempb); e != nil { 653 | return nil, fmt.Errorf("Cannot decode filter [%s]", e.Error()) 654 | } 655 | /// read length of regular slice bytes 656 | if temp, e = r.Collect(32); e != nil { 657 | return nil, fmt.Errorf("Cannot read length of regular slice [%s]", e.Error()) 658 | } 659 | /// detach encoded regular slice bytes 660 | remaining -= int(temp) + 4 661 | if LogEnabled { 662 | fmt.Printf("Detaching %d bytes for reg slice data -- remains %d\n", int(temp), remaining) 663 | } 664 | if tempb, e = r.DeAppend(int(temp)); e != nil { 665 | return nil, fmt.Errorf("Cannot read regular slice bytes [%s]", e.Error()) 666 | } 667 | /// decode regular slice 668 | if h.regularSlice, e = decodeRegularSlice(tempb); e != nil { 669 | return nil, fmt.Errorf("Cannot decode regular slice [%s]", e.Error()) 670 | } 671 | /// read length of regular map bytes 672 | if temp, e = r.Collect(32); e != nil { 673 | return nil, fmt.Errorf("Cannot read length of regular map [%s]", e.Error()) 674 | } 675 | /// detach encoded regular map bytes 676 | remaining -= int(temp) + 4 677 | if LogEnabled { 678 | fmt.Printf("Detaching %d bytes for reg map data -- remains %d\n", int(temp), remaining) 679 | } 680 | if tempb, e = r.DeAppend(int(temp)); e != nil { 681 | return nil, fmt.Errorf("Cannot read regular map bytes [%s]", e.Error()) 682 | } 683 | /// decode regular map 684 | if h.regularMap, e = decodeRegularMap(tempb); e != nil { 685 | return nil, fmt.Errorf("Cannot decode regular map [%s]", e.Error()) 686 | } 687 | /// read length of forward map bytes 688 | if temp, e = r.Collect(32); e != nil { 689 | return nil, fmt.Errorf("Cannot read length of forward map [%s]", e.Error()) 690 | } 691 | remaining -= int(temp) + 4 692 | if LogEnabled { 693 | fmt.Printf("Detaching %d bytes for fwd map data -- remains %d\n", int(temp), remaining) 694 | } 695 | /// detach encoded forward map bytes 696 | if tempb, e = r.DeAppend(int(temp)); e != nil { 697 | return nil, fmt.Errorf("Cannot read forward map bytes [%s]", e.Error()) 698 | } 699 | /// decode forward map 700 | if h.optimizedForward, e = decodeForwardMap(tempb); e != nil { 701 | return nil, fmt.Errorf("Cannot decode forward map 
[%s]", e.Error()) 702 | } 703 | 704 | return 705 | } 706 | 707 | // EncodeToPath - encodes internal structure into a byte slice, and flushes it to disk 708 | func (h *HtEvSt) EncodeToPath(outFile string) (b []byte, e error) { 709 | b, e = h.Encode() 710 | if e != nil { 711 | return nil, e 712 | } 713 | bb := new(bytes.Buffer) 714 | bb.Write(b) 715 | ioutil.WriteFile(outFile, b, 0755) 716 | return b, e 717 | } 718 | 719 | // Encode - encodes internal structure 720 | func (h *HtEvSt) Encode() (ret []byte, e error) { 721 | e = nil 722 | var t []byte 723 | sumBytes := 0 724 | b := serialize.NewBitStreamOps() 725 | /// first encode the filter (with leading numbytes) 726 | t = h.filter.Encode() 727 | b.Emit(uint(len(t)), 32) 728 | b.Append(t) 729 | sumBytes += len(t) + 4 730 | /// next encode the regular slice (with leading numbytes) 731 | if t, e = encodeRegularSlice(h.regularSlice); e != nil { 732 | return nil, fmt.Errorf("encode regular slice error [%s]", e.Error()) 733 | } 734 | b.Emit(uint(len(t)), 32) 735 | b.Append(t) 736 | sumBytes += len(t) + 4 737 | /// follows encoding of regular map 738 | if t, e = encodeRegularMap(h); e != nil { 739 | return nil, fmt.Errorf("encode regular map error [%s]", e.Error()) 740 | } 741 | b.Emit(uint(len(t)), 32) 742 | b.Append(t) 743 | sumBytes += len(t) 744 | /// follows encoding of forward map 745 | if t, e = encodeForwardMap(h); e != nil { 746 | return nil, fmt.Errorf("encode forward map error [%s]", e.Error()) 747 | } 748 | 749 | b.Emit(uint(len(t)), 32) 750 | b.Append(t) 751 | sumBytes += len(t) + 4 752 | 753 | if LogEnabled { 754 | fmt.Printf("The encode buffer is [%d] bytes long.\n", len(b.Buffer())) 755 | } 756 | 757 | return b.Buffer(), nil 758 | } 759 | 760 | /// calculate the hash of the target (used in encoding) 761 | /// uses 64bit hash, because 32bit has 1 collision 762 | /// le: could move the one collsion to forward table, and save some space at encoding... 
760 | /// calculate the hash of the target (used in encoding) 761 | /// uses a 32bit fnv hash; the one known collision (as of 2017.09.12) is detected in Parse, 762 | /// and the colliding target is evacuated to the forward table 763 | func hash(s string) uint32 { 764 | h := fnv.New32() 765 | h.Write([]byte(s)) 766 | return h.Sum32() 767 | } 768 | 769 | /// when a scattered target is found which already has an entry in the map, a retroactive rule merge and entry rewrite is necessary 770 | /// and since the later simplification of rulesets will be made from the input slice, we allocate the new super-rules there (note: the unification scheme is currently iced -- see Parse) 771 | func (r *RulesetSt) retroactiveJoin(s *RulesetSt, hm map[string]*RulesetSt, data *HtEvSt) { 772 | uniqRules := make(map[string]RuleSt) 773 | uniqTargets := make(map[string]TargetSt) 774 | uniqExclusions := make(map[string]ExclusionSt) 775 | a := []*RulesetSt{r, s} 776 | /// join slices 777 | for _, ruleset := range a { 778 | for _, t := range ruleset.Target { 779 | uniqTargets[t.Host] = t 780 | } 781 | for _, r := range ruleset.Rule { 782 | uniqRules[r.From+"THIS!!!IS@@@JUST###A$$$TEXT"+r.To] = r 783 | } 784 | for _, e := range ruleset.Exclusion { 785 | uniqExclusions[e.Pattern] = e 786 | } 787 | } 788 | 789 | //superRuleset := &RulesetSt{Name: r.Name + "-REDUX", Disabled: r.Disabled, Target: make([]TargetSt, 0), Rule: make([]RuleSt, 0), Exclusion: make([]ExclusionSt, 0)} 790 | superRuleset := data.newRulesetSt() 791 | superRuleset.Name = r.Name + "-REDUX" 792 | superRuleset.Disabled = r.Disabled 793 | superRuleset.Target = make([]TargetSt, 0) 794 | superRuleset.Rule = make([]RuleSt, 0) 795 | superRuleset.Exclusion = make([]ExclusionSt, 0) 796 | 797 | for _, t := range uniqTargets { 798 | superRuleset.Target = append(superRuleset.Target, t) 799 | } 800 | for _, r := range uniqRules { 801 | superRuleset.Rule = append(superRuleset.Rule, r) 802 | } 803 | for _, e := range uniqExclusions { 804 | superRuleset.Exclusion = append(superRuleset.Exclusion, e) 805 | } 806 | 807 | /// at this point we have a brand new combined power-rangers-like super rule. yay. 808 | /// follows rewriting the map for all entries 809 | 810 | for _, t := range superRuleset.Target { 811 | hm[t.Host] = superRuleset 812 | } 813 | /// propagate the change back to the origin; this helps in persisting the change back to the forward table (if applicable). 814 | *r = *superRuleset 815 | } 816 | 817 | // Parse - reads rule xml files and constructs their in-memory representation 818 | func Parse(RulePath string) (*HtEvSt, error) { 819 | list, err := ioutil.ReadDir(RulePath) 820 | if err != nil { 821 | return nil, fmt.Errorf("error reading dir. [%s]", err.Error()) 822 | } 823 | 824 | data := new(HtEvSt) 825 | data.input = make([]*RulesetSt, 0) 826 | data.filter = cuckoofilter.NewDefaultCuckooFilter() 827 | data.forward = make(map[string][]*RulesetSt) 828 | data.regularMap = make(map[uint32][]int) 829 | data.regularSlice = make([]*SimplifiedRulesetSt, 0) 830 | inputNum := 0 831 | regularNum := 0 832 | inputStrlen := 0 833 | trickyNum := 0 834 | 835 | test := make(map[string][]*RulesetSt) 836 | 837 | for _, entry := range list { 838 | if entry.IsDir() || !strings.HasSuffix(entry.Name(), ".xml") { 839 | continue 840 | 841 | } 842 | 843 | xmldata, err := ioutil.ReadFile(RulePath + "/" + entry.Name()) 844 | if err != nil { 845 | return nil, fmt.Errorf("error reading file. 
[%s]", entry.Name()) 846 | 847 | } 848 | 849 | res := data.newRulesetSt() 850 | if err := xml.Unmarshal(xmldata, &res); err != nil { 851 | if LogEnabled { 852 | fmt.Printf("Error occured in file [%s] :: [%s]\n", entry.Name(), err.Error()) 853 | } 854 | continue 855 | } 856 | 857 | /// use only valid rulesets 858 | if res.Disabled == "" && res.Platform != "mixedcontent" && res.Platform != "cacert" { 859 | /// collect some data to brag about 860 | inputNum += len(res.Target) 861 | for _, rule := range res.Rule { 862 | inputStrlen += len(rule.From) + len(rule.To) 863 | } 864 | for _, excl := range res.Exclusion { 865 | inputStrlen += len(excl.Pattern) 866 | } 867 | for _, t := range res.Target { 868 | inputStrlen += len(t.Host) 869 | /// move support for tricky wildcards to a future time, fortunately the number of these edge cases is 0 870 | if strings.Contains(t.Host, "*.*") { 871 | trickyNum++ 872 | continue 873 | } 874 | /// when a wildcard is not on one of the extremes of the string, save it in a straightforward map 875 | if strings.Count(t.Host, "*") >= 1 && !strings.HasPrefix(t.Host, "*") && !strings.HasSuffix(t.Host, "*") { 876 | if LogEnabled { 877 | fmt.Printf(">>[%s] SAVED IN FORWARD\n", t.Host) 878 | } 879 | if data.forward[t.Host] == nil { 880 | data.forward[t.Host] = make([]*RulesetSt, 0) 881 | } 882 | data.forward[t.Host] = append(data.forward[t.Host], res) 883 | continue 884 | } 885 | /// if the pattern is not spectacular in any way, save it in the filter 886 | data.filter.InsertUnique([]byte(t.Host)) 887 | regularNum++ 888 | /// check for target already in map, and do the grand unification scheme if so 889 | /// iced the unification scheme, it breaks some rules 890 | if test[t.Host] == nil { 891 | test[t.Host] = make([]*RulesetSt, 0) 892 | } 893 | test[t.Host] = append(test[t.Host], res) 894 | } 895 | } else { 896 | // if logEnabled { 897 | // fmt.Printf("Disabled rule [%s] cause [%s]\n", res.Name, res.Disabled) 898 | // } 899 | } 900 | 901 | } 902 | 903 | /// saves the unique indexes here 904 | encodeMap = make(map[int]int) 905 | /// have a map of hashes->target strings to decide if there's an actual collision or just a duplicate target entry (since the rule joining technique is dissolved) 906 | /// for now, the logic is first arrived, first served; in time will evacuate forward if more duplicates are there than not (right now 1 hash collision is present, 2017.09.12) 907 | collisionChecker := make(map[uint32]string) 908 | savedDefaultRule := false 909 | defaultRuleIndex := -1 910 | for target, objArr := range test { 911 | /// approach here is, that we calculate the hash for the target, we check if it's a _legit_ collision (different target producing same hash value) 912 | hc := hash(target) 913 | /// if there's a collision, move the target to forward table, and save with the larger structure (will be simplified later) 914 | if _, contains := data.regularMap[hc]; contains && collisionChecker[hc] != target { 915 | if LogEnabled { 916 | fmt.Printf("Collision detected --> moving target [%s] to forward table.\n", target) 917 | } 918 | data.forward[target] = objArr 919 | continue 920 | } 921 | /// register hash in collision checker 922 | collisionChecker[hc] = target 923 | /// if that specific struct was already saved, save the reference in the current target too 924 | if data.regularMap[hc] == nil { 925 | data.regularMap[hc] = make([]int, 0) 926 | } 927 | for _, obj := range objArr { 928 | if index, contains := encodeMap[obj.Index]; contains { 929 | data.regularMap[hc] = 
append(data.regularMap[hc], index) 930 | } else { 931 | 932 | simple := obj.simplify() 933 | if simple.isDefaultRule() && savedDefaultRule { 934 | data.regularMap[hc] = append(data.regularMap[hc], defaultRuleIndex) 935 | encodeMap[obj.Index] = defaultRuleIndex 936 | continue 937 | } else if simple.isDefaultRule() { 938 | defaultRuleIndex = len(data.regularSlice) 939 | savedDefaultRule = true 940 | } 941 | 942 | if len(simple.ruleFrom) != len(simple.ruleTo) { 943 | if LogEnabled { 944 | fmt.Printf("BIG PROBLEM! [%s]\n[%s]\n", simple, obj) 945 | } 946 | } 947 | data.regularSlice = append(data.regularSlice, simple) 948 | currInd := len(data.regularSlice) - 1 949 | data.regularMap[hc] = append(data.regularMap[hc], currInd) 950 | encodeMap[obj.Index] = currInd 951 | } 952 | } 953 | } 954 | 955 | /// bragging and testing section 956 | if LogEnabled { 957 | fmt.Printf("Read [%d] entries, with [%d] targets grand total, [%d] total characters, and [%d] tricky wildcards\n", len(data.input), inputNum, inputStrlen, trickyNum) 958 | } 959 | start := time.Now() 960 | for _, e := range data.input { 961 | for _, t := range e.Target { 962 | /// lookup timing benchmark only; targets stored in the forward map are expected to miss here 963 | _ = data.filter.Lookup([]byte(t.Host)) 964 | } 965 | } 966 | 967 | if LogEnabled { 968 | fmt.Printf("Checked entries in [%v] time\n", time.Since(start)) 969 | fmt.Printf("Filter takes around [%d] space.\n", len(data.filter.Encode())) 970 | fmt.Printf("Hashmap stats %d vs %d vs %d\n", len(data.regularMap), len(test), len(data.regularSlice)) 971 | } 972 | 973 | totalcharsAgain := 0 974 | for _, r := range data.regularSlice { 975 | totalcharsAgain += r.countChars() 976 | } 977 | bitnum := int(math.Ceil(math.Log2(float64(len(data.regularSlice))))) 978 | if LogEnabled { 979 | fmt.Printf("And again just to double check, [%d] is the total number of characters. [%d] entries --> [%d] bits to encode indexes\n", totalcharsAgain, len(data.regularSlice), bitnum) 980 | fmt.Printf("Map will approximately take [%d] bytes to encode.\n", 4*(len(data.regularMap)+len(data.regularSlice)+1)) 981 | } 982 | 983 | return data, nil 984 | } 985 | --------------------------------------------------------------------------------