├── .gitignore
├── requirements.txt
├── assets
    └── output.gif
├── app.py
├── get_tokens.py
├── LICENSE.md
├── README.md
└── agent.py


/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | __pycache__/
3 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | frida==16.1.4
2 | xpcspy==0.8.3


--------------------------------------------------------------------------------
/assets/output.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jackcook/predictive-spy/HEAD/assets/output.gif


--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | 
 3 | from frida_tools.application import ConsoleApplication
 4 | from xpcspy.lib.types import Filter
 5 | 
 6 | from agent import Agent
 7 | 
 8 | 
 9 | class XPCSpyApplication(ConsoleApplication):
10 |     def _usage(self):
11 |         return "%(prog)s"
12 | 
13 |     def _needs_target(self):
14 |         return True
15 | 
16 |     def _initialize(self, parser, options, args):
17 |         self._filter = Filter.from_str("o:*")
18 |         self._should_parse = True
19 |         self._print_timestamp = False
20 |         self._target = ("name", "AppleSpell")
21 | 
22 |     def _start(self):
23 |         agent = Agent(
24 |             self._filter,
25 |             self._should_parse,
26 |             self._session,
27 |             self._reactor,
28 |             self._print_timestamp,
29 |         )
30 |         agent.start_hooking(self)
31 | 
32 | 
if __name__ == "__main__":
    # When the script is started without any argument, supply "AppleSpell"
    # as the only command-line argument before the application parses them.
    if len(sys.argv) == 1:
        sys.argv += ["AppleSpell"]

    XPCSpyApplication().run()
39 | 


--------------------------------------------------------------------------------
/get_tokens.py:
--------------------------------------------------------------------------------
 1 | # Read binary data from sp.dat
 2 | data = open(
 3 |     "/System/Library/LinguisticData/RequiredAssets_en.bundle/AssetData/en.lm/unilm.bundle/sp.dat",
 4 |     "rb",
 5 | ).read()
 6 | 
 7 | # Find the <pad> token, which is the first token in the vocab
 8 | first_token_offset = data.find(b"<pad>", data.find(b"<pad>") + 1)
 9 | 
10 | if first_token_offset == -1:
11 |     raise Exception(
12 |         "Could not find <pad> token. You may need to update to macOS Sonoma."
13 |     )
14 | 
15 | # Parse the tokens
16 | tokens = []
17 | current_token = b""
18 | 
19 | for byte in range(first_token_offset, len(data)):
20 |     # Tokens are split by null bytes
21 |     if data[byte] == 0:
22 |         tokens.append(current_token.decode("utf-8"))
23 |         current_token = b""
24 | 
25 |         if len(tokens) == 15000:
26 |             break
27 |     else:
28 |         current_token += bytes([data[byte]])
29 | 
30 | # Write all tokens to vocab.txt
31 | with open("vocab.txt", "w") as f:
32 |     for i, token in enumerate(tokens):
33 |         f.write(token)
34 | 
35 |         if i != len(tokens) - 1:
36 |             f.write("\n")
37 | 


--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
 1 | # MIT License
 2 | 
 3 | Copyright (c) 2023 Jack Cook
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Predictive Spy 🕵️
 2 | 
 3 | Code accompanying my blogpost, “[A look at Apple’s new Transformer-powered predictive text model.](https://jackcook.com/2023/09/08/predictive-text.html)”
 4 | With this repository, you can snoop on activity from the new predictive text model in macOS Sonoma.
 5 | 
 6 | **Note:** At some point this summer, Apple removed the ability to spy on model predictions, but I’m not sure which beta they did this in.
 7 | I can confirm it works in macOS Sonoma beta 1, but not in beta 7.
 8 | 
 9 | <img src="/assets/output.gif" width="75%" style="margin: 0 auto" alt="Demo snooping on predictive text model predictions" />
10 | 
11 | ## Introduction
12 | 
13 | This repository has two scripts:
14 | 
15 | - **get_tokens.py**: Generates a vocabulary file from the predictive text model
16 | - **app.py**: Spies on predictive text model activity
17 | 
18 | Both scripts work only on macOS Sonoma (14); neither will work on macOS Ventura (13) or earlier.
19 | If you’re just interested in getting the vocabulary file, you don’t need to follow any of the setup instructions.
20 | 
21 | ## Spying Setup
22 | 
23 | **Note:** I tested these instructions most recently on a virtual machine in Parallels, but these instructions should also work on a real machine.
24 | If you need to install a VM, I found a link to a macOS Sonoma beta 1 IPSW [here](https://ipswbeta.dev/macos/14.x/).
25 | 
26 | ### Disable SIP
27 | 
28 | Follow [this guide](https://developer.apple.com/documentation/security/disabling_and_enabling_system_integrity_protection) to disable system integrity protection.
29 | You’ll need to boot into recovery mode, run a command, and then reboot.
30 | If you’re doing this on a real machine, don’t forget to re-enable SIP once you’re done :-)
31 | 
32 | ### Install Command Line Tools
33 | 
34 | Usually, you should be able to install command line tools with the following command:
35 | 
36 | ```bash
37 | xcode-select --install
38 | ```
39 | 
40 | However, I had trouble doing this in my VM, so I downloaded the most recent Command Line Tools package from the Apple Developer website.
41 | 
42 | ### Install fq
43 | 
44 | To install [`fq`](https://github.com/wader/fq), you can follow the instructions in their README, or install with Homebrew:
45 | 
46 | ```bash
47 | brew install wader/tap/fq
48 | ```
49 | 
50 | ### Install dependencies
51 | 
52 | ```bash
53 | pip3 install -r requirements.txt
54 | ```
55 | 
56 | ## Usage
57 | 
58 | Once everything is set up, you should be able to run it with sudo:
59 | 
60 | ```bash
61 | sudo python3 app.py
62 | ```
63 | 
64 | You may see the following error:
65 | 
66 | ```
67 | Failed to spawn: unable to find a process with name 'AppleSpell'
68 | ```
69 | 
70 | This is because AppleSpell needs to be running when you start the command.
71 | In order to ensure this is the case, open the Notes app (or any other app with a text field) and start typing, then try starting the script again.
72 | 
73 | ## License
74 | 
75 | `predictive-spy` is available under the MIT license. See the [LICENSE](LICENSE.md) file for more details.
76 | 


--------------------------------------------------------------------------------
/agent.py:
--------------------------------------------------------------------------------
  1 | from os import path
  2 | from collections import OrderedDict
  3 | 
  4 | import xpcspy
  5 | from xpcspy.lib.types import Event
  6 | import subprocess
  7 | import json
  8 | import traceback
  9 | 
 10 | 
 11 | def convert_value(value):
 12 |     if value["type"] == "ascii_string":
 13 |         return value["value"]
 14 |     elif value["type"] == "unicode_string":
 15 |         return value["value"]
 16 |     elif value["type"] == "int":
 17 |         return value["value"]
 18 |     elif value["type"] == "uid":
 19 |         return value["value"]
 20 |     elif value["type"] == "dict":
 21 |         return convert_dict(value["entries"])
 22 |     elif value["type"] == "array":
 23 |         return convert_array(value["entries"])
 24 |     else:
 25 |         return json.dumps(value)
 26 | 
 27 | 
 28 | def convert_dict(obj):
 29 |     return {entry["key"]["value"]: convert_value(entry["value"]) for entry in obj}
 30 | 
 31 | 
 32 | def convert_array(obj):
 33 |     return [convert_value(val) for val in obj]
 34 | 
 35 | 
 36 | class Agent:
 37 |     def __init__(self, filter, should_parse, session, reactor, print_timestamp=False):
 38 |         """
 39 |         Initialize the Frida agent
 40 |         """
 41 |         self._pending_events = (
 42 |             OrderedDict()
 43 |         )  # A map of stacks, each stack holding events for that particular timestamp
 44 |         self._filter = filter
 45 |         self._should_parse = should_parse
 46 |         self._print_timestamp = print_timestamp
 47 |         self._script_path = path.join(path.dirname(xpcspy.__file__), "..", "_agent.js")
 48 |         with open(self._script_path) as src_f:
 49 |             script_src = src_f.read()
 50 |         self._script = session.create_script(script_src)
 51 |         self._reactor = reactor
 52 |         self._agent = None
 53 | 
 54 |     def start_hooking(self, ui):
 55 |         def on_message(message, data):
 56 |             self._reactor.schedule(lambda: self._on_message(message, data, ui))
 57 | 
 58 |         self._script.on("message", on_message)
 59 |         self._script.load()
 60 |         ui._update_status("Installing hooks...")
 61 |         self._agent = self._script.exports
 62 |         self._agent.install_hooks(self._filter, self._should_parse)
 63 | 
 64 |     def _on_message(self, message, data, ui):
 65 |         mtype = message["payload"]["type"]
 66 | 
 67 |         if mtype == "agent:hooks_installed":
 68 |             ui._update_status("Hooks installed, intercepting messages...")
 69 |             ui._resume()
 70 |         elif mtype == "agent:trace:symbol":
 71 |             symbol = message["payload"]["message"]["symbol"]
 72 |             timestamp = message["payload"]["message"]["timestamp"]
 73 |             if timestamp in self._pending_events:
 74 |                 self._pending_events[timestamp].append(Event(symbol))
 75 |             else:
 76 |                 self._pending_events.update({timestamp: [Event(symbol)]})
 77 |         elif mtype == "agent:trace:data":
 78 |             timestamp = message["payload"]["message"]["timestamp"]
 79 |             data = message["payload"]["message"]["data"]
 80 |             self._pending_events[timestamp][-1].data = data
 81 |         else:
 82 |             ui._print(f"Unhandled message {message}")
 83 | 
 84 |         self.flush_pending_events(ui)
 85 | 
 86 |     def flush_pending_events(self, ui):
 87 |         """Flush pending events that are ready, i.e. have received both its symbol and data"""
 88 |         for ts, events_stack in list(self._pending_events.items()):
 89 |             while len(events_stack) > 0:
 90 |                 last_event = events_stack[-1]  # Peek
 91 | 
 92 |                 if last_event.data == None:
 93 |                     return
 94 | 
 95 |                 for line in last_event.data["message"].splitlines():
 96 |                     if "<62706c69" in line:
 97 |                         encoded_bplist = line[
 98 |                             line.index("<") + 1 : line.index(">", -1)
 99 |                         ].replace(" ", "")
100 |                         cmd = f"echo {encoded_bplist} | xxd -r -p | fq d -V"
101 |                         decoded_bplist = subprocess.check_output(
102 |                             cmd, shell=True
103 |                         ).decode("utf-8")
104 |                         payload = json.loads(decoded_bplist)
105 | 
106 |                         print(payload)
107 | 
108 |                         data = convert_array(
109 |                             payload["objects"]["entries"][3]["value"]["entries"]
110 |                         )
111 |                         indices = data[1]
112 | 
113 |                         if len(indices["NS.objects"]) == 0:
114 |                             continue
115 |                         else:
116 |                             indices = indices["NS.objects"]
117 | 
118 |                         print("-" * 40)
119 |                         lines_printed = 0
120 | 
121 |                         for i in indices:
122 |                             try:
123 |                                 if data[i]["$class"] in [4, 10, 12]:
124 |                                     replacement_str = data[
125 |                                         data[i]["NSReplacementString"]
126 |                                     ]
127 |                                     promoted = "NSIsPromoted" in data[i]
128 | 
129 |                                     if promoted:
130 |                                         print(f"*** {replacement_str} ***")
131 |                                     else:
132 |                                         print(replacement_str)
133 |                                 elif data[i]["$class"] == 6:
134 |                                     replacement_str = data[
135 |                                         data[i]["NSReplacementString"]
136 |                                     ]
137 |                                     print(f"*** {replacement_str} ***")
138 |                                 else:
139 |                                     continue
140 | 
141 |                                 lines_printed += 1
142 |                             except:
143 |                                 print(traceback.format_exc())
144 |                                 print(data)
145 | 
146 |                 events_stack.pop()
147 |             del self._pending_events[ts]
148 | 


--------------------------------------------------------------------------------