├── .gitignore ├── requirements.txt ├── assets └── output.gif ├── app.py ├── get_tokens.py ├── LICENSE.md ├── README.md └── agent.py /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | __pycache__/ 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | frida==16.1.4 2 | xpcspy==0.8.3 -------------------------------------------------------------------------------- /assets/output.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackcook/predictive-spy/HEAD/assets/output.gif -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from frida_tools.application import ConsoleApplication 4 | from xpcspy.lib.types import Filter 5 | 6 | from agent import Agent 7 | 8 | 9 | class XPCSpyApplication(ConsoleApplication): 10 | def _usage(self): 11 | return "%(prog)s" 12 | 13 | def _needs_target(self): 14 | return True 15 | 16 | def _initialize(self, parser, options, args): 17 | self._filter = Filter.from_str("o:*") 18 | self._should_parse = True 19 | self._print_timestamp = False 20 | self._target = ("name", "AppleSpell") 21 | 22 | def _start(self): 23 | agent = Agent( 24 | self._filter, 25 | self._should_parse, 26 | self._session, 27 | self._reactor, 28 | self._print_timestamp, 29 | ) 30 | agent.start_hooking(self) 31 | 32 | 33 | if __name__ == "__main__": 34 | if len(sys.argv) == 1: 35 | sys.argv.append("AppleSpell") 36 | 37 | app = XPCSpyApplication() 38 | app.run() 39 | -------------------------------------------------------------------------------- /get_tokens.py: -------------------------------------------------------------------------------- 1 | # Read binary data from sp.dat 2 | data = open( 3 | "/System/Library/LinguisticData/RequiredAssets_en.bundle/AssetData/en.lm/unilm.bundle/sp.dat", 4 | "rb", 5 | ).read() 6 | 7 | # Find the token, which is the first token in the vocab 8 | first_token_offset = data.find(b"", data.find(b"") + 1) 9 | 10 | if first_token_offset == -1: 11 | raise Exception( 12 | "Could not find token. You may need to update to macOS Sonoma." 13 | ) 14 | 15 | # Parse the tokens 16 | tokens = [] 17 | current_token = b"" 18 | 19 | for byte in range(first_token_offset, len(data)): 20 | # Tokens are split by null bytes 21 | if data[byte] == 0: 22 | tokens.append(current_token.decode("utf-8")) 23 | current_token = b"" 24 | 25 | if len(tokens) == 15000: 26 | break 27 | else: 28 | current_token += bytes([data[byte]]) 29 | 30 | # Write all tokens to vocab.txt 31 | with open("vocab.txt", "w") as f: 32 | for i, token in enumerate(tokens): 33 | f.write(token) 34 | 35 | if i != len(tokens) - 1: 36 | f.write("\n") 37 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | Copyright (c) 2023 Jack Cook 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Predictive Spy 🕵️ 2 | 3 | Code accompanying my blogpost, “[A look at Apple’s new Transformer-powered predictive text model.](https://jackcook.com/2023/09/08/predictive-text.html)” 4 | With this repository, you can snoop on activity from the new predictive text model in macOS Sonoma. 5 | 6 | **Note:** At some point this summer, Apple removed the ability to spy on model predictions at some point, but I’m not sure which beta they did this in. 7 | I can confirm it works in macOS Sonoma beta 1, but not in beta 7. 8 | 9 | Demo snooping on predictive text model predictions 10 | 11 | ## Introduction 12 | 13 | This repository has two scripts: 14 | 15 | - **get_tokens.py**: Generates a vocabulary file from the predictive text model 16 | - **app.py**: Spies on predictive text model activity 17 | 18 | Both scripts only work on macOS Sonoma (14), neither will work on macOS Ventura (13) or earlier. 19 | If you’re just interested in getting the vocabulary file, you don’t need to follow any of the setup instructions. 20 | 21 | ## Spying Setup 22 | 23 | **Note:** I tested these instructions most recently on a virtual machine in Parallels, but these instructions should also work on a real machine. 24 | If you need to install a VM, I found a link to a macOS Sonoma beta 1 IPSW [here](https://ipswbeta.dev/macos/14.x/). 25 | 26 | ### Disable SIP 27 | 28 | Follow [this guide](https://developer.apple.com/documentation/security/disabling_and_enabling_system_integrity_protection) to disable system integrity protection. 29 | You’ll need to boot into recovery mode, run a command, and then reboot. 30 | If you’re doing this on a real machine, don’t forget to re-enable SIP once you’re done :-) 31 | 32 | ### Install Command Line Tools 33 | 34 | Usually, you should be able to install command line tools with the following command: 35 | 36 | ```bash 37 | xcode-select --install 38 | ``` 39 | 40 | However, I had trouble doing this in my VM, so I downloaded the most recent Command Line Tools package from the Apple Developer website. 41 | 42 | ### Install fq 43 | 44 | To install [`fq`](https://github.com/wader/fq), you can follow the instructions in their README, or install with Homebrew: 45 | 46 | ```bash 47 | brew install wader/tap/fq 48 | ``` 49 | 50 | ### Install dependencies 51 | 52 | ```bash 53 | pip3 install -r requirements.txt 54 | ``` 55 | 56 | ## Usage 57 | 58 | Once everything is set up, you should be able to run it with sudo: 59 | 60 | ```bash 61 | sudo python3 app.py 62 | ``` 63 | 64 | You may see the following error: 65 | 66 | ``` 67 | Failed to spawn: unable to find a process with name 'AppleSpell' 68 | ``` 69 | 70 | This is because AppleSpell needs to be running when you start the command. 71 | In order to ensure this is the case, open the Notes app (or any other app with a text field) and start typing, then try starting the script again. 72 | 73 | ## License 74 | 75 | `predictive-spy` is available under the MIT license. See the [LICENSE](LICENSE.md) file for more details. 76 | -------------------------------------------------------------------------------- /agent.py: -------------------------------------------------------------------------------- 1 | from os import path 2 | from collections import OrderedDict 3 | 4 | import xpcspy 5 | from xpcspy.lib.types import Event 6 | import subprocess 7 | import json 8 | import traceback 9 | 10 | 11 | def convert_value(value): 12 | if value["type"] == "ascii_string": 13 | return value["value"] 14 | elif value["type"] == "unicode_string": 15 | return value["value"] 16 | elif value["type"] == "int": 17 | return value["value"] 18 | elif value["type"] == "uid": 19 | return value["value"] 20 | elif value["type"] == "dict": 21 | return convert_dict(value["entries"]) 22 | elif value["type"] == "array": 23 | return convert_array(value["entries"]) 24 | else: 25 | return json.dumps(value) 26 | 27 | 28 | def convert_dict(obj): 29 | return {entry["key"]["value"]: convert_value(entry["value"]) for entry in obj} 30 | 31 | 32 | def convert_array(obj): 33 | return [convert_value(val) for val in obj] 34 | 35 | 36 | class Agent: 37 | def __init__(self, filter, should_parse, session, reactor, print_timestamp=False): 38 | """ 39 | Initialize the Frida agent 40 | """ 41 | self._pending_events = ( 42 | OrderedDict() 43 | ) # A map of stacks, each stack holding events for that particular timestamp 44 | self._filter = filter 45 | self._should_parse = should_parse 46 | self._print_timestamp = print_timestamp 47 | self._script_path = path.join(path.dirname(xpcspy.__file__), "..", "_agent.js") 48 | with open(self._script_path) as src_f: 49 | script_src = src_f.read() 50 | self._script = session.create_script(script_src) 51 | self._reactor = reactor 52 | self._agent = None 53 | 54 | def start_hooking(self, ui): 55 | def on_message(message, data): 56 | self._reactor.schedule(lambda: self._on_message(message, data, ui)) 57 | 58 | self._script.on("message", on_message) 59 | self._script.load() 60 | ui._update_status("Installing hooks...") 61 | self._agent = self._script.exports 62 | self._agent.install_hooks(self._filter, self._should_parse) 63 | 64 | def _on_message(self, message, data, ui): 65 | mtype = message["payload"]["type"] 66 | 67 | if mtype == "agent:hooks_installed": 68 | ui._update_status("Hooks installed, intercepting messages...") 69 | ui._resume() 70 | elif mtype == "agent:trace:symbol": 71 | symbol = message["payload"]["message"]["symbol"] 72 | timestamp = message["payload"]["message"]["timestamp"] 73 | if timestamp in self._pending_events: 74 | self._pending_events[timestamp].append(Event(symbol)) 75 | else: 76 | self._pending_events.update({timestamp: [Event(symbol)]}) 77 | elif mtype == "agent:trace:data": 78 | timestamp = message["payload"]["message"]["timestamp"] 79 | data = message["payload"]["message"]["data"] 80 | self._pending_events[timestamp][-1].data = data 81 | else: 82 | ui._print(f"Unhandled message {message}") 83 | 84 | self.flush_pending_events(ui) 85 | 86 | def flush_pending_events(self, ui): 87 | """Flush pending events that are ready, i.e. have received both its symbol and data""" 88 | for ts, events_stack in list(self._pending_events.items()): 89 | while len(events_stack) > 0: 90 | last_event = events_stack[-1] # Peek 91 | 92 | if last_event.data == None: 93 | return 94 | 95 | for line in last_event.data["message"].splitlines(): 96 | if "<62706c69" in line: 97 | encoded_bplist = line[ 98 | line.index("<") + 1 : line.index(">", -1) 99 | ].replace(" ", "") 100 | cmd = f"echo {encoded_bplist} | xxd -r -p | fq d -V" 101 | decoded_bplist = subprocess.check_output( 102 | cmd, shell=True 103 | ).decode("utf-8") 104 | payload = json.loads(decoded_bplist) 105 | 106 | print(payload) 107 | 108 | data = convert_array( 109 | payload["objects"]["entries"][3]["value"]["entries"] 110 | ) 111 | indices = data[1] 112 | 113 | if len(indices["NS.objects"]) == 0: 114 | continue 115 | else: 116 | indices = indices["NS.objects"] 117 | 118 | print("-" * 40) 119 | lines_printed = 0 120 | 121 | for i in indices: 122 | try: 123 | if data[i]["$class"] in [4, 10, 12]: 124 | replacement_str = data[ 125 | data[i]["NSReplacementString"] 126 | ] 127 | promoted = "NSIsPromoted" in data[i] 128 | 129 | if promoted: 130 | print(f"*** {replacement_str} ***") 131 | else: 132 | print(replacement_str) 133 | elif data[i]["$class"] == 6: 134 | replacement_str = data[ 135 | data[i]["NSReplacementString"] 136 | ] 137 | print(f"*** {replacement_str} ***") 138 | else: 139 | continue 140 | 141 | lines_printed += 1 142 | except: 143 | print(traceback.format_exc()) 144 | print(data) 145 | 146 | events_stack.pop() 147 | del self._pending_events[ts] 148 | --------------------------------------------------------------------------------